tensorflow学习(五)基于CNN的验证码识别

验证码的识别是OCR中的一个重要内容,特别是对于爬虫系统意义重大,传统方法都是单个字符识别,包括二值化处理、字符分割、字符识别的过程。由于现在深度学习的火爆,end-to-end的方法流行起来。这里就用tensorflow实现一个端到端的CNN验证码识别功能。

本文的想法是把验证码看成一个多标签学习的问题,相当于几个有标签的图像识别。这里没有考虑使用LSTM对验证码序列进行学习,因为我个人觉得验证码字符之间的相关性不强,没必要用这种大杀器。

验证码数据集

使用python的captcha库生成验证码数据集,这个库可以生成声音和图像两种形式的验证码,安装过程非常简单:

pip install captcha

使用示例如下:

from captcha.image import ImageCaptcha

image = ImageCaptcha(fonts=['/path/A.ttf', '/path/B.ttf'])
image.write('1234', 'out.png')

数据输入

验证码图片是以字节流的形式随机生成的,因此训练样本量可以无穷大。我们这里使用定长的4位验证码,字符包含0~9、a~z、A~Z。图像大小缩放到(30, 80, 3)。代码如下:

import numpy as np
import cv2
import random
import tensorflow as tf
from captcha.image import ImageCaptcha

class OCR_data(object):
    """In-memory captcha dataset.

    Generates `num` fixed-length captchas with the `captcha` library, decodes
    them with OpenCV, and stores them as numpy arrays together with flattened
    one-hot labels of length len_code * 62.
    Class mapping: digits 0-9 -> 0-9, 'A'-'Z' -> 10-35, 'a'-'z' -> 36-61.
    """

    def __init__(self, num, data_dir, batch_size=50, len_code=4, height=30, width=80):
        # NOTE: data_dir is stored but unused -- images are generated in
        # memory, not read from disk.
        self.num = num
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.len_code = len_code
        self.height = height
        self.width = width
        self.captcha = ImageCaptcha()
        self.index_in_epoch = 0
        self._imgs = []
        self._labels = []
        for i in range(self.num):
            if i % 100 == 0:
                # print() form works under both Python 2 and Python 3.
                print('%s images have been created.' % i)
            img, label = self.create_captcha()
            self._imgs.append(img)
            self._labels.append(label)
        self._imgs = np.array(self._imgs)
        self._labels = np.array(self._labels)

    def create_captcha(self):
        """Generate one sample; returns ((height, width, 3) image, one-hot label)."""
        code, label = self.gen_rand()
        img = self.captcha.generate(code)
        # np.fromstring is deprecated for binary input; frombuffer is the
        # direct replacement.
        img = np.frombuffer(img.getvalue(), dtype='uint8')
        img = cv2.imdecode(img, cv2.IMREAD_COLOR)
        img = cv2.resize(img, (self.width, self.height))
        return (img, label)

    def gen_rand(self):
        """Draw a random len_code-character string; returns (text, one-hot label)."""
        buf = ''
        label = []
        for i in range(self.len_code):
            rnd = random.randint(0, 61)
            label.append(rnd)
            # Map class index -> character. The original offsets (rnd+65 and
            # rnd+97) were wrong for classes >= 10 (class 10 became 'K', and
            # classes >= 36 became non-ASCII bytes), so the rendered text did
            # not match the label. Correct offsets below.
            if rnd < 10:
                ascii_code = chr(rnd + ord('0'))        # 0-9   -> '0'-'9'
            elif rnd < 36:
                ascii_code = chr(rnd - 10 + ord('A'))   # 10-35 -> 'A'-'Z'
            else:
                ascii_code = chr(rnd - 36 + ord('a'))   # 36-61 -> 'a'-'z'
            buf += ascii_code
        label_one_hot = self.dense_to_one_hot(label, 62)
        return buf, label_one_hot

    def dense_to_one_hot(self, labels_dense, num_classes):
        """Convert a list of class indices into one flat one-hot vector."""
        num_labels = len(labels_dense)
        index_offset = np.arange(num_labels) * num_classes
        labels_one_hot = np.zeros((num_labels, num_classes))
        labels_one_hot.flat[index_offset + labels_dense] = 1
        return labels_one_hot.reshape(num_labels * num_classes)

    def next_batch(self, batch_size):
        """Return the next (imgs, labels) slice, reshuffling when an epoch ends."""
        start = self.index_in_epoch
        self.index_in_epoch += batch_size
        if self.index_in_epoch > self.num:
            # Epoch exhausted: shuffle images and labels in unison, restart.
            perm = np.arange(self.num)
            np.random.shuffle(perm)
            self._imgs = self._imgs[perm]
            self._labels = self._labels[perm]
            start = 0
            self.index_in_epoch = batch_size
            assert batch_size <= self.num
        end = self.index_in_epoch
        return self._imgs[start:end], self._labels[start:end]

训练

import tensorflow as tf
from captcha_data import OCR_data
# Parameters
learning_rate = 0.001
training_iters = 200000  # stop once step * batch_size reaches this many samples
batch_size = 64
display_step = 20  # log loss/accuracy every `display_step` steps

# Network Parameters
# n_input = 7200  # 30*80*3
n_classes = 62  # 10+26+26 -- digits + upper-case + lower-case letters

# In-memory datasets; the data_dir argument is stored but images are
# generated on the fly (see OCR_data.__init__).
data_train = OCR_data(1000, '/data/captcha_data')
data_test = OCR_data(500, '/data/captcha_data')

# tf Graph input: batch of 30x80 BGR images and 4 concatenated 62-way one-hot labels.
x = tf.placeholder(tf.float32, [None, 30, 80, 3])
y = tf.placeholder(tf.float32, [None, 4*n_classes])

def print_activations(t):
    """Debug helper: log a tensor's graph name and its static shape."""
    static_shape = t.get_shape().as_list()
    print(t.op.name, static_shape)

def weight_variable(shape):
    """Trainable weight tensor drawn from a truncated normal (stddev=0.1)."""
    init_vals = tf.truncated_normal(shape, dtype=tf.float32, stddev=0.1)
    return tf.Variable(init_vals)

def bias_variable(shape):
    """Trainable bias tensor initialized to all zeros."""
    return tf.Variable(tf.constant(0.0, shape=shape), trainable=True)

def conv2d(x, W, B, name):
    """Stride-1 'SAME' convolution followed by bias add and ReLU."""
    with tf.name_scope(name) as scope:
        linear = tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
        return tf.nn.relu(tf.nn.bias_add(linear, B), name=scope)

def max_pool(x, k, name):
    """Max-pooling with a k x k window, stride 2, 'SAME' padding."""
    window = [1, k, k, 1]
    return tf.nn.max_pool(x, ksize=window, strides=[1, 2, 2, 1],
                          padding='SAME', name=name)

def avg_pool(x, k, name):
    """Average-pooling with a k x k window, stride 2, 'SAME' padding."""
    window = [1, k, k, 1]
    return tf.nn.avg_pool(x, ksize=window, strides=[1, 2, 2, 1],
                          padding='SAME', name=name)

def norm(x, lsize, name):
    """Local response normalization over `lsize` neighbouring channels."""
    return tf.nn.lrn(x, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                     name=name)

# Convolution / fully-connected parameters. After three stride-2 poolings the
# 30x80 input is reduced to 4x10 spatially (30 -> 15 -> 8 -> 4, 80 -> 40 ->
# 20 -> 10), hence the 4*10*32 flattened size feeding 'wd1'. Four separate
# heads ('out1'..'out4') score the 62 classes for each captcha character.
weights = {
    'wc1': weight_variable([5, 5, 3, 32]),
    'wc2': weight_variable([5, 5, 32, 32]),
    'wc3': weight_variable([3, 3, 32, 32]),
    'wd1': weight_variable([4*10*32, 512]),
    'out1': weight_variable([512, n_classes]),
    'out2': weight_variable([512, n_classes]),
    'out3': weight_variable([512, n_classes]),
    'out4': weight_variable([512, n_classes])
}
# One bias per layer above, matching each layer's output width.
biases = {
    'bc1': bias_variable([32]),
    'bc2': bias_variable([32]),
    'bc3': bias_variable([32]),
    'bd1': bias_variable([512]),
    'out1': bias_variable([n_classes]),
    'out2': bias_variable([n_classes]),
    'out3': bias_variable([n_classes]),
    'out4': bias_variable([n_classes]),
}

def ocr_net(_x, _weights, _biases):
    """Forward pass of the captcha CNN.

    Three conv+pool stages, one shared fully-connected layer, then four
    independent 62-way output heads (one per captcha character). Returns a
    (batch, 4*62) tensor in which each 62-wide slice is a softmax
    distribution over that character's classes.
    """
    _x = tf.reshape(_x, shape=[-1, 30, 80, 3])

    conv1 = conv2d(_x, _weights['wc1'], _biases['bc1'], 'conv1')
    print_activations(conv1)
    pool1 = max_pool(conv1, k=2, name='pool1')
    print_activations(pool1)

    conv2 = conv2d(pool1, _weights['wc2'], _biases['bc2'], 'conv2')
    print_activations(conv2)
    pool2 = avg_pool(conv2, k=2, name='pool2')
    print_activations(pool2)

    conv3 = conv2d(pool2, _weights['wc3'], _biases['bc3'], 'conv3')
    print_activations(conv3)
    pool3 = avg_pool(conv3, k=2, name='pool3')
    print_activations(pool3)

    pool3_flat = tf.reshape(pool3, [-1, _weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.nn.relu(tf.matmul(pool3_flat, _weights['wd1']) + _biases['bd1'], name='fc1')
    print_activations(fc1)

    # Output heads: the original code applied relu to these logits, which
    # zeroes every negative logit and makes the -mean(y*log(pred)) loss
    # ill-defined (log(0) -> -inf). Softmax turns each head into a proper
    # probability distribution while keeping the output shape identical.
    fc21 = tf.nn.softmax(tf.matmul(fc1, _weights['out1']) + _biases['out1'], name='fc21')
    print_activations(fc21)

    fc22 = tf.nn.softmax(tf.matmul(fc1, _weights['out2']) + _biases['out2'], name='fc22')
    print_activations(fc22)

    fc23 = tf.nn.softmax(tf.matmul(fc1, _weights['out3']) + _biases['out3'], name='fc23')
    print_activations(fc23)

    fc24 = tf.nn.softmax(tf.matmul(fc1, _weights['out4']) + _biases['out4'], name='fc24')
    print_activations(fc24)

    out = tf.concat(axis=1, values=[fc21, fc22, fc23, fc24], name='out')
    print_activations(out)
    return out

def accuracy_func(_pred, _y):
    """Per-character accuracy: the mean over all 4 positions of all samples
    (not whole-captcha accuracy)."""
    labels = tf.reshape(_y, shape=[-1, 4, 62])
    scores = tf.reshape(_pred, shape=[-1, 4, 62])
    hits = tf.equal(tf.argmax(scores, 2), tf.argmax(labels, 2))
    return tf.reduce_mean(tf.cast(hits, tf.float32))

pred = ocr_net(x, weights, biases)

# Cross-entropy over the four output heads. Clip predictions away from zero
# so tf.log cannot produce -inf/NaN when a probability underflows.
cost = -tf.reduce_mean(y * tf.log(tf.clip_by_value(pred, 1e-10, 1.0)))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

accuracy = accuracy_func(pred, y)

init = tf.global_variables_initializer()

# Grow GPU memory on demand instead of reserving it all upfront.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:
    sess.run(init)
    step = 1
    # Keep training until `training_iters` samples have been consumed.
    while step * batch_size < training_iters:
        batch = data_train.next_batch(batch_size)
        # Fit training using batch data
        sess.run(optimizer, feed_dict={x: batch[0], y: batch[1]})
        if step % display_step == 0:
            # Fetch loss and accuracy in one run to avoid two redundant
            # forward passes over the same batch.
            loss, acc = sess.run([cost, accuracy],
                                 feed_dict={x: batch[0], y: batch[1]})
            print("Iter " + str(step * batch_size)
                  + ", Minibatch Loss= " + "{:.6f}".format(loss)
                  + ", Training Accuracy= " + "{:.5f}".format(acc))
        step += 1
    print("Optimization Finished!")

    test_batch = data_test.next_batch(500)
    print("Testing Accuracy: {}".format(
        sess.run(accuracy, feed_dict={x: test_batch[0], y: test_batch[1]})))

预测

tbd