CAPTCHA recognition is an important OCR task, and it matters a great deal for web crawlers. Traditional pipelines work one character at a time: binarization, character segmentation, then per-character recognition. With the rise of deep learning, end-to-end methods have become popular, so here we build an end-to-end CNN CAPTCHA recognizer in TensorFlow.
The idea is to treat the CAPTCHA as a multi-label learning problem, i.e., several labeled image-classification tasks sharing one image. I did not use an LSTM over the character sequence: in my view the correlation between CAPTCHA characters is weak, so that heavy machinery is unnecessary.
The CAPTCHA dataset
We generate the dataset with python-captcha, a library that can produce both audio and image CAPTCHAs. Installation is simple:
pip install captcha
Example usage:
from captcha.image import ImageCaptcha
image = ImageCaptcha(fonts=['/path/A.ttf', '/path/B.ttf'])
image.write('1234', 'out.png')
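Note that the data pipeline below never writes files: it calls generate(), which returns the CAPTCHA's PNG bytes as an in-memory stream. A minimal sketch of that path, decoded with OpenCV the same way the class below does it:

import numpy as np
import cv2
from captcha.image import ImageCaptcha

image = ImageCaptcha()                          # default built-in font
buf = image.generate('1234')                    # io.BytesIO holding PNG bytes
arr = np.frombuffer(buf.getvalue(), dtype='uint8')
img = cv2.imdecode(arr, cv2.IMREAD_COLOR)       # H x W x 3 BGR array
print(img.shape)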
Data input
Since each CAPTCHA is produced as a byte stream and can be generated randomly on the fly, the amount of training data is effectively unlimited. We use fixed-length 4-character codes drawn from 0-9, a-z, and A-Z (62 classes) and resize each image to (30, 80, 3). The code:
import numpy as np
import cv2
import random
import tensorflow as tf
from captcha.image import ImageCaptcha
class OCR_data(object):
    def __init__(self, num, data_dir, batch_size=50, len_code=4, height=30, width=80):
        self.num = num
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.len_code = len_code
        self.height = height
        self.width = width
        self.captcha = ImageCaptcha()
        self.index_in_epoch = 0
        self._imgs = []
        self._labels = []
        for i in range(self.num):
            if i % 100 == 0:
                print('%s images have been created.' % i)
            img, label = self.create_captcha()
            self._imgs.append(img)
            self._labels.append(label)
        self._imgs = np.array(self._imgs)
        self._labels = np.array(self._labels)

    def create_captcha(self):
        code, label = self.gen_rand()
        img = self.captcha.generate(code)  # BytesIO of PNG bytes
        img = np.frombuffer(img.getvalue(), dtype='uint8')  # np.fromstring is deprecated
        img = cv2.imdecode(img, cv2.IMREAD_COLOR)
        img = cv2.resize(img, (self.width, self.height))  # cv2.resize takes (width, height)
        return (img, label)

    def gen_rand(self):
        buf = ''
        label = []
        for i in range(self.len_code):
            rnd = random.randint(0, 61)  # 62 classes: 10 digits + 26 uppercase + 26 lowercase
            label.append(rnd)
            if rnd < 10:
                ascii_code = chr(rnd + 48)  # '0'-'9'
            elif rnd < 36:
                ascii_code = chr(rnd + 55)  # 'A'-'Z' (10 maps to 65)
            else:
                ascii_code = chr(rnd + 61)  # 'a'-'z' (36 maps to 97)
            buf += ascii_code
        label_one_hot = self.dense_to_one_hot(label, 62)
        return buf, label_one_hot

    def dense_to_one_hot(self, labels_dense, num_classes):
        # pack the 4 label indices into one flat vector of 4 concatenated one-hot blocks
        num_labels = len(labels_dense)
        index_offset = np.arange(num_labels) * num_classes
        labels_one_hot = np.zeros((num_labels, num_classes))
        labels_one_hot.flat[index_offset + labels_dense] = 1
        labels_one_hot = labels_one_hot.reshape(num_labels*num_classes)
        return labels_one_hot

    def next_batch(self, batch_size):
        start = self.index_in_epoch
        self.index_in_epoch += batch_size
        if self.index_in_epoch > self.num:
            # epoch finished: reshuffle and restart from the beginning
            perm = np.arange(self.num)
            np.random.shuffle(perm)
            self._imgs = self._imgs[perm]
            self._labels = self._labels[perm]
            start = 0
            self.index_in_epoch = batch_size
            assert batch_size <= self.num
        end = self.index_in_epoch
        return self._imgs[start:end], self._labels[start:end]
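A quick sanity check of the class (a minimal sketch; the constructor pre-generates all num images in memory, so keep it small when experimenting):

data = OCR_data(200, '/data/captcha_data')
imgs, labels = data.next_batch(50)
print(imgs.shape)    # (50, 30, 80, 3)
print(labels.shape)  # (50, 248): 4 characters x 62 classes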
Training
import tensorflow as tf
from captcha_data import OCR_data

# Parameters
learning_rate = 0.001
training_iters = 200000
batch_size = 64
display_step = 20

# Network Parameters
# n_input = 7200 # 30*80*3
n_classes = 62  # 10+26+26

data_train = OCR_data(1000, '/data/captcha_data')
data_test = OCR_data(500, '/data/captcha_data')

# tf Graph input
x = tf.placeholder(tf.float32, [None, 30, 80, 3])
# y packs the 4 one-hot character labels into one flat vector of length 4*62
y = tf.placeholder(tf.float32, [None, 4*n_classes])

def print_activations(t):
    print(t.op.name, t.get_shape().as_list())

def weight_variable(shape):
    initial = tf.truncated_normal(shape, dtype=tf.float32, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.0, shape=shape)
    return tf.Variable(initial, trainable=True)

def conv2d(x, W, B, name):
    with tf.name_scope(name) as scope:
        conv = tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
        bias = tf.nn.bias_add(conv, B)
        conv = tf.nn.relu(bias, name=scope)
        return conv

def max_pool(x, k, name):
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

def avg_pool(x, k, name):
    return tf.nn.avg_pool(x, ksize=[1, k, k, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

def norm(x, lsize, name):
    return tf.nn.lrn(x, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name=name)

weights = {
    'wc1': weight_variable([5, 5, 3, 32]),
    'wc2': weight_variable([5, 5, 32, 32]),
    'wc3': weight_variable([3, 3, 32, 32]),
    # three stride-2 SAME poolings shrink 30x80 to 4x10, with 32 channels
    'wd1': weight_variable([4*10*32, 512]),
    'out1': weight_variable([512, n_classes]),
    'out2': weight_variable([512, n_classes]),
    'out3': weight_variable([512, n_classes]),
    'out4': weight_variable([512, n_classes])
}

biases = {
    'bc1': bias_variable([32]),
    'bc2': bias_variable([32]),
    'bc3': bias_variable([32]),
    'bd1': bias_variable([512]),
    'out1': bias_variable([n_classes]),
    'out2': bias_variable([n_classes]),
    'out3': bias_variable([n_classes]),
    'out4': bias_variable([n_classes]),
}

def ocr_net(_x, _weights, _biases):
    _x = tf.reshape(_x, shape=[-1, 30, 80, 3])

    conv1 = conv2d(_x, _weights['wc1'], _biases['bc1'], 'conv1')
    print_activations(conv1)
    pool1 = max_pool(conv1, k=2, name='pool1')
    print_activations(pool1)

    conv2 = conv2d(pool1, _weights['wc2'], _biases['bc2'], 'conv2')
    print_activations(conv2)
    pool2 = avg_pool(conv2, k=2, name='pool2')
    print_activations(pool2)

    conv3 = conv2d(pool2, _weights['wc3'], _biases['bc3'], 'conv3')
    print_activations(conv3)
    pool3 = avg_pool(conv3, k=2, name='pool3')
    print_activations(pool3)

    pool3_flat = tf.reshape(pool3, [-1, _weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.nn.relu(tf.matmul(pool3_flat, _weights['wd1']) + _biases['bd1'], name='fc1')
    print_activations(fc1)

    # four output heads, one per character; kept linear (no ReLU) so they can be
    # fed to a softmax cross-entropy loss as logits
    fc21 = tf.add(tf.matmul(fc1, _weights['out1']), _biases['out1'], name='fc21')
    print_activations(fc21)
    fc22 = tf.add(tf.matmul(fc1, _weights['out2']), _biases['out2'], name='fc22')
    print_activations(fc22)
    fc23 = tf.add(tf.matmul(fc1, _weights['out3']), _biases['out3'], name='fc23')
    print_activations(fc23)
    fc24 = tf.add(tf.matmul(fc1, _weights['out4']), _biases['out4'], name='fc24')
    print_activations(fc24)

    out = tf.concat(axis=1, values=[fc21, fc22, fc23, fc24], name='out')
    print_activations(out)
    return out

def accuracy_func(_pred, _y):
    # per-character accuracy: each of the 4 positions counts separately
    y = tf.reshape(_y, shape=[-1, 4, 62])
    pred = tf.reshape(_pred, shape=[-1, 4, 62])
    correct_pred = tf.equal(tf.argmax(pred, 2), tf.argmax(y, 2))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    return accuracy

pred = ocr_net(x, weights, biases)
# per-character softmax cross entropy over the logits; numerically stabler than
# the hand-rolled -tf.reduce_mean(y*tf.log(pred)), which hits log(0) -> NaN
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.reshape(y, [-1, n_classes]),
    logits=tf.reshape(pred, [-1, n_classes])))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
accuracy = accuracy_func(pred, y)

init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(init)
    step = 1
    # keep training until we reach the maximum number of iterations
    while step * batch_size < training_iters:
        batch = data_train.next_batch(batch_size)
        # fit training using batch data
        sess.run(optimizer, feed_dict={x: batch[0], y: batch[1]})
        if step % display_step == 0:
            # calculate batch accuracy and batch loss
            acc = sess.run(accuracy, feed_dict={x: batch[0], y: batch[1]})
            loss = sess.run(cost, feed_dict={x: batch[0], y: batch[1]})
            print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss) + ", Training Accuracy= " + "{:.5f}".format(acc))
        step += 1
    print("Optimization Finished!")
    test_batch = data_test.next_batch(500)
    print("Testing Accuracy:", sess.run(accuracy, feed_dict={x: test_batch[0], y: test_batch[1]}))
Prediction
tbd
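In the meantime, a minimal sketch of what inference could look like, assuming it runs inside the training session above; charset mirrors the index order used in gen_rand (digits, then uppercase, then lowercase):

import numpy as np

charset = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
sample_imgs, _ = data_test.next_batch(1)
logits = sess.run(pred, feed_dict={x: sample_imgs})
# pick the best class per character position and map indices back to characters
indices = np.argmax(logits.reshape(-1, 4, 62), axis=2)
print(''.join(charset[i] for i in indices[0]))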