add verify code

pull/251/merge
文贤平 2019-06-11 19:22:45 +08:00
parent bd30d7e457
commit d0d552a5af
4 changed files with 267 additions and 0 deletions

BIN
uml/REIL_DEVICEID.png Normal file

Binary file not shown.


75
verify/localVerifyCode.py Normal file

@@ -0,0 +1,75 @@
# coding: utf-8
import base64

import cv2
import numpy as np
from keras import models

from verify import pretreatment
from verify.mlearn_for_image import preprocess_input


def get_text(img, offset=0):
    # crop the keyword region, convert to grayscale, normalize to [0, 1]
    # and reshape into a single-sample batch for the text model
    text = pretreatment.get_text(img, offset)
    text = cv2.cvtColor(text, cv2.COLOR_BGR2GRAY)
    text = text / 255.0
    h, w = text.shape
    text.shape = (1, h, w, 1)
    return text


def base64_to_image(base64_code):
    # decode the base64 payload
    img_data = base64.b64decode(base64_code)
    # wrap the raw bytes in a numpy array (frombuffer replaces the deprecated fromstring)
    img_array = np.frombuffer(img_data, np.uint8)
    # decode into an OpenCV BGR image (IMREAD_COLOR, not a color-conversion code)
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    return img


def verify(fn):
    # fn is a base64-encoded captcha; returns the 1-based positions of the matching pictures
    verify_titles = ['打字机', '调色板', '跑步机', '毛线', '老虎', '安全帽', '沙包', '盘子', '本子', '药片', '双面胶', '龙舟', '红酒', '拖把', '卷尺', '海苔', '红豆', '黑板', '热水袋', '烛台', '钟表', '路灯', '沙拉', '海报', '公交卡', '樱桃', '创可贴', '牌坊', '苍蝇拍', '高压锅', '电线', '网球拍', '海鸥', '风铃', '订书机', '冰箱', '话梅', '排风机', '锅铲', '绿豆', '航母', '电子秤', '红枣', '金字塔', '鞭炮', '菠萝', '开瓶器', '电饭煲', '仪表盘', '棉棒', '篮球', '狮子', '蚂蚁', '蜡烛', '茶盅', '印章', '茶几', '啤酒', '档案袋', '挂钟', '刺绣', '铃铛', '护腕', '手掌印', '锦旗', '文具盒', '辣椒酱', '耳塞', '中国结', '蜥蜴', '剪纸', '漏斗', '', '蒸笼', '珊瑚', '雨靴', '薯条', '蜜蜂', '日历', '口哨']
    # read and preprocess the captcha
    img = base64_to_image(fn)
    text = get_text(img)
    imgs = np.array(list(pretreatment._get_imgs(img)))
    imgs = preprocess_input(imgs)
    text_list = []
    # recognize the first keyword
    model = models.load_model('model.v2.0.h5')
    label = model.predict(text)
    label = label.argmax()
    text = verify_titles[label]
    text_list.append(text)
    # look for a second keyword;
    # its position depends on the length of the first keyword
    if len(text) == 1:
        offset = 27
    elif len(text) == 2:
        offset = 47
    else:
        offset = 60
    text = get_text(img, offset=offset)
    if text.mean() < 0.95:
        label = model.predict(text)
        label = label.argmax()
        text = verify_titles[label]
        text_list.append(text)
    print("Captcha keywords: {}".format(text_list))
    # load the image classifier
    model = models.load_model('12306.image.model.h5')
    labels = model.predict(imgs)
    labels = labels.argmax(axis=1)
    results = []
    for pos, label in enumerate(labels):
        l = verify_titles[label]
        print(pos+1, l)
        if l in text_list:
            results.append(str(pos+1))
    return results


if __name__ == '__main__':
    # verify() expects base64 input, so encode the sample image before passing it in
    with open("verify-img1.jpeg", 'rb') as fp:
        print(verify(base64.b64encode(fp.read())))
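Usage note (not part of the diff): a minimal sketch of driving verify() end to end, assuming the working directory is the repository root (so model.v2.0.h5 and 12306.image.model.h5 resolve) and that the getPassCodeNew endpoint returns raw image bytes, as download_image() in verify/pretreatment.py assumes.

# sketch: fetch a live captcha and ask verify() which picture positions match
import base64
import requests
from verify.localVerifyCode import verify

url = 'https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand'
resp = requests.get(url)
positions = verify(base64.b64encode(resp.content))
print(positions)   # 1-based indices of the pictures that match the keywords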

92
verify/mlearn_for_image.py Normal file

@@ -0,0 +1,92 @@
# coding: utf-8
import sys

import cv2
import numpy as np
from keras import models
from keras import layers
from keras import optimizers
from keras.applications import VGG16
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator


def preprocess_input(x):
    x = x.astype('float32')
    # images are read with cv2, so they are already in BGR order;
    # subtract the per-channel means expected by VGG16
    mean = [103.939, 116.779, 123.68]
    x -= mean
    return x


def load_data():
    # training set labelled by statistical analysis
    data = np.load('captcha.npz')
    train_x, train_y = data['images'], data['labels']
    train_x = preprocess_input(train_x)
    # the labels are only statistical, so give each sample a confidence weight
    sample_weight = train_y.max(axis=1) / np.sqrt(train_y.sum(axis=1))
    sample_weight /= sample_weight.mean()
    train_y = train_y.argmax(axis=1)
    # validation set labelled by hand
    data = np.load('captcha.test.npz')
    test_x, test_y = data['images'], data['labels']
    test_x = preprocess_input(test_x)
    return (train_x, train_y, sample_weight), (test_x, test_y)


def learn():
    (train_x, train_y, sample_weight), (test_x, test_y) = load_data()
    datagen = ImageDataGenerator(horizontal_flip=True,
                                 vertical_flip=True)
    train_generator = datagen.flow(train_x, train_y, sample_weight=sample_weight)
    # ImageNet-pretrained VGG16 base; only its last 4 layers stay trainable
    base = VGG16(weights='imagenet', include_top=False, input_shape=(None, None, 3))
    for layer in base.layers[:-4]:
        layer.trainable = False
    model = models.Sequential([
        base,
        layers.BatchNormalization(),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.GlobalAveragePooling2D(),
        layers.BatchNormalization(),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.20),
        layers.Dense(80, activation='softmax')
    ])
    model.compile(optimizer=optimizers.RMSprop(lr=1e-5),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    reduce_lr = ReduceLROnPlateau(verbose=1)
    model.fit_generator(train_generator, epochs=400,
                        steps_per_epoch=100,
                        validation_data=(test_x[:800], test_y[:800]),
                        callbacks=[reduce_lr])
    result = model.evaluate(test_x, test_y)
    print(result)
    model.save('12306.image.model.h5', include_optimizer=False)


def predict(imgs):
    imgs = preprocess_input(imgs)
    model = models.load_model('12306.image.model.h5')
    labels = model.predict(imgs)
    return labels


def _predict(fn):
    imgs = cv2.imread(fn)
    imgs = cv2.resize(imgs, (67, 67))
    imgs.shape = (-1, 67, 67, 3)
    labels = predict(imgs)
    print(labels.max(axis=1))
    print(labels.argmax(axis=1))


if __name__ == '__main__':
    if len(sys.argv) >= 2:
        # classify a single candidate image given on the command line
        _predict(sys.argv[1])
    else:
        # otherwise train from captcha.npz / captcha.test.npz
        learn()
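For reference (not part of this commit), a sketch of how predict() combines with pretreatment._get_imgs() to score the eight candidate pictures of a single captcha; 'imgs/example.jpg' is a placeholder for any captcha fetched by pretreatment.download_image().

# sketch: score every candidate picture of one downloaded captcha
import cv2
import numpy as np
from verify import pretreatment
from verify.mlearn_for_image import predict

img = cv2.imread('imgs/example.jpg')                  # placeholder file name
crops = np.array(list(pretreatment._get_imgs(img)))   # the 8 candidate 67x67 crops
scores = predict(crops)                               # softmax scores, one row per crop
print(scores.argmax(axis=1))                          # predicted class index per position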

100
verify/pretreatment.py Normal file

@@ -0,0 +1,100 @@
#!/usr/bin/env python
# coding: utf-8
# Purpose: preprocess the captcha images and extract the text (keyword)
# region into the ocr directory; each output file keeps the name of the
# original captcha file.
import hashlib
import os
import pathlib

import cv2
import numpy as np
import requests
import scipy.fftpack

PATH = 'imgs'


def download_image():
    # fetch one captcha and store it under PATH,
    # using the MD5 of the image as the file name
    url = 'https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand'
    r = requests.get(url)
    fn = hashlib.md5(r.content).hexdigest()
    with open(f'{PATH}/{fn}.jpg', 'wb') as fp:
        fp.write(r.content)


def download_images():
    pathlib.Path(PATH).mkdir(exist_ok=True)
    for idx in range(40000):
        download_image()
        print(idx)


def get_text(img, offset=0):
    # crop the keyword text region out of the captcha
    return img[3:22, 120 + offset:177 + offset]


def avhash(im):
    # average hash: shrink to 8x8, threshold at the mean, pack the bits
    im = cv2.resize(im, (8, 8), interpolation=cv2.INTER_CUBIC)
    avg = im.mean()
    im = im > avg
    im = np.packbits(im)
    return im


def phash(im):
    # perceptual hash: 2-D DCT of a 32x32 image, keep the 8x8 low-frequency
    # block and threshold it at its median
    im = cv2.resize(im, (32, 32), interpolation=cv2.INTER_CUBIC)
    im = scipy.fftpack.dct(scipy.fftpack.dct(im, axis=0), axis=1)
    im = im[:8, :8]
    med = np.median(im)
    im = im > med
    im = np.packbits(im)
    return im


def _get_imgs(img):
    # iterate over the candidate pictures: 67x67 tiles separated by
    # 5-pixel gaps, starting 40 pixels from the top of the captcha
    interval = 5
    length = 67
    for x in range(40, img.shape[0] - length, interval + length):
        for y in range(interval, img.shape[1] - length, interval + length):
            yield img[x:x + length, y:y + length]


def get_imgs(img):
    imgs = []
    for img in _get_imgs(img):
        imgs.append(phash(img))
    return imgs


def pretreat():
    if not os.path.isdir(PATH):
        download_images()
    texts, imgs = [], []
    for img in os.listdir(PATH):
        img = os.path.join(PATH, img)
        img = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
        texts.append(get_text(img))
        imgs.append(get_imgs(img))
    return texts, imgs


def load_data(path='data.npz'):
    if not os.path.isfile(path):
        texts, imgs = pretreat()
        np.savez(path, texts=texts, images=imgs)
    f = np.load(path)
    return f['texts'], f['images']


if __name__ == '__main__':
    texts, imgs = load_data()
    print(texts.shape)
    print(imgs.shape)
    imgs = imgs.reshape(-1, 8)
    print(np.unique(imgs, axis=0).shape)
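As an illustration that goes slightly beyond this commit, packed phash fingerprints like the ones counted above are usually compared by Hamming distance; a minimal helper for that, using np.unpackbits, could look like this.

# sketch: Hamming distance between two packed 8-byte phash fingerprints
import numpy as np

def hamming(h1, h2):
    # unpack back to 64 bits and count how many differ
    return int(np.count_nonzero(np.unpackbits(h1) != np.unpackbits(h2)))

# identical candidate pictures give distance 0, e.g. hamming(phash(a), phash(b))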