2016-03-25

darknet学习（一）YOLO目标检测

darknet安装

darknet的安装还是非常简单的，依赖只有cuda和opencv，安装好之后执行以下命令即可安装编译成功

git clone https://github.com/pjreddie/darknet.git
cd darknet
vim Makefile
#################[modify Makefile]
GPU=1
OPENCV=1
OPTS=-Ofast  # 如果gcc版本过低，这里可能需要改成-O3
#################[close]
make

YOLO训练

准备训练数据

检测算法需要对图像中的目标区域进行标注，我这里实验只准备了5个类别的检测数据，我用python基于OpenCV做了一个简单的标注工具，可以得到矩形左上角、右下角的坐标位置，代码如下：

import os
import cv2
import numpy as np
import copy
import glob
import sys


#PT1 = (0,0)
#PT2 = (0,0)
def draw_box(event, x, y, flags, param):
    """Mouse callback that rubber-bands a rectangle on the 'image' window.

    Updates the module-level corner points PT1 (anchor) and PT2 (moving
    corner), draws the rectangle on the scratch image ``img_copy``, shows it,
    and then resets ``img_copy`` from the module-level ``img`` so the next
    event draws on a clean frame.

    Args:
        event: OpenCV mouse event code (cv2.EVENT_*).
        x: Cursor x position in window pixels.
        y: Cursor y position in window pixels.
        flags: OpenCV mouse flags for this event.
        param: Unused user data required by the callback signature.
    """
    global PT1
    global PT2
    global img_copy
    cursor = (x, y)
    if event == cv2.EVENT_LBUTTONDOWN:
        # A button-down event always starts a new box; do not additionally
        # require an exact flags value, which differs when modifier keys are
        # held.
        PT1 = cursor
        PT2 = cursor
    elif event == cv2.EVENT_LBUTTONUP or flags & cv2.EVENT_FLAG_LBUTTON:
        # flags is tested as a bitmask so that dragging with Ctrl/Shift/Alt
        # held still moves the second corner.
        PT2 = cursor
    cv2.rectangle(img_copy, PT1, PT2, (0, 255, 0), 2)
    cv2.imshow('image', img_copy)
    # Start the next redraw from the untouched image so old rectangles
    # do not accumulate.
    img_copy = copy.deepcopy(img)


if __name__ == '__main__':
    # Interactive annotation session.
    #   Space (32): save the current box and move on to the next image.
    #   z (122):    drop the most recently saved box from the result file
    #               and exit, so the next run starts again at that image.
    #   Esc (27):   quit.
    # PT1, PT2, img and img_copy are module-level names shared with the
    # draw_box mouse callback.
    img_path = 'D:/test'
    img_list = sorted(glob.glob(os.path.join(img_path, '*.jpg')))
    box_result_name = 'box_result.txt'
    box_result_path = os.path.join(img_path, box_result_name)

    # Images that already have a saved box are skipped, which lets an
    # interrupted session resume where it stopped.
    img_processed = set()
    try:
        with open(box_result_path, 'r') as f_done:
            for line in f_done:
                line = line.strip()
                if line:
                    img_processed.add(line.split(',')[0])
    except IOError:
        # No result file yet: nothing has been annotated.
        pass

    cv2.namedWindow('image')
    cv2.setMouseCallback('image', draw_box)

    f_box = open(box_result_path, 'a')
    try:
        for img_name in img_list:
            print(img_name)
            if img_name in img_processed:
                continue
            img = cv2.imread(img_name)
            if img is None:
                # imread returns None for unreadable files; skip them
                # instead of failing inside the mouse callback.
                print('cannot read image: %s' % img_name)
                continue
            img_copy = copy.deepcopy(img)
            PT1 = (0, 0)
            PT2 = (0, 0)
            # Show the image right away; otherwise the window stays empty
            # until the first mouse event triggers draw_box.
            cv2.imshow('image', img_copy)

            while True:
                # Mask to the low byte: some builds return key codes with
                # extra high bits set.
                key = cv2.waitKey(0) & 0xFF
                if key == 27:
                    sys.exit(0)
                elif key == 32:
                    # A zero coordinate is treated as "no box drawn yet";
                    # boxes smaller than 10 px on a side are rejected as
                    # accidental clicks.
                    if (PT1[0] == 0 or PT1[1] == 0
                            or PT2[0] == 0 or PT2[1] == 0
                            or abs(PT2[0] - PT1[0]) < 10
                            or abs(PT2[1] - PT1[1]) < 10):
                        continue
                    # Normalize to (left, top, right, bottom) regardless of
                    # the drag direction.
                    x1, x2 = sorted((PT1[0], PT2[0]))
                    y1, y2 = sorted((PT1[1], PT2[1]))
                    f_box.write('%s,%d,%d,%d,%d\n'
                                % (img_name, x1, y1, x2, y2))
                    break
                elif key == 122:
                    # Rewrite the result file without its last record,
                    # keeping the remaining lines and their newlines intact.
                    f_box.close()
                    with open(box_result_path, 'r') as f_prev:
                        lines = f_prev.readlines()
                    with open(box_result_path, 'w') as f_prev:
                        f_prev.writelines(lines[:-1])
                    print('delete last image box')
                    sys.exit(0)
    finally:
        # Runs on normal completion and on sys.exit(); closing an already
        # closed file is a no-op.
        cv2.destroyAllWindows()
        f_box.close()

这个标注工具代码比较繁琐，不过还算鲁棒，可以有效处理误标注并实现自动跳转下一张图的功能。每一个待检测的类别用这个标注工具可以生成一个对应的文本文件说明每一张图的矩形坐标位置。

生成用于darknet的标注

darknet要求每一张图片a.jpg对应一个a.txt，文本里面一行信息说明groundtruth的类别和图像原始宽高的相对坐标，如下：

# <object-class> <x> <y> <width> <height>
1 0.526875 0.499375 0.30875 0.80375

我基于darknet自带的scripts\voc_label.py写了一个脚本把图像以及标注工具生成的矩形坐标转换成满足darknet的标注信息

import pickle
import os
from os import listdir, getcwd
from os.path import join

# Annotation sets to convert; each name selects box_imgs/<name>.txt as input
# and box_labels/<name>/ as the output directory.
sets = ['men_jacket', 'men_bottem', 'underwear', 'women_bottem', 'women_jacket']
# Class names; the class id written to a label file is the index of the name
# in this list.
classes = ['men_jacket', 'men_bottem', 'underwear', 'women_bottem', 'women_jacket']


def convert(size, box):
    """Turn a corner-format pixel box into a normalized center-format box.

    Args:
        size: Sequence whose first two items are the image width and height.
        box: Sequence (x1, y1, x2, y2) with the two corner coordinates.

    Returns:
        Tuple (x_center, y_center, width, height), each scaled by the
        reciprocal of the corresponding image dimension.
    """
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    center_x = (left + right) / 2.0
    center_y = (top + bottom) / 2.0
    span_x = right - left
    span_y = bottom - top
    return (center_x * scale_x, center_y * scale_y,
            span_x * scale_x, span_y * scale_y)

def convert_annotation(line, image_id, set_id, size=(800, 800)):
    """Write the darknet label file for one annotated image.

    The label is written to box_labels/<set_id>/<image_id>.txt as
    "<class-id> <x> <y> <width> <height>" with normalized coordinates.

    Args:
        line: Split annotation record [image_path, x1, y1, x2, y2]; the
            class name is the second '/'-separated component of image_path.
        image_id: Image file name without extension.
        set_id: Name of the annotation set, used as the output sub-directory.
        size: (width, height) of the image in pixels. Defaults to 800x800,
            the size that was previously hard-coded.

    Returns:
        None. If the class is not listed in ``classes`` the label file is
        still created but left empty.
    """
    label_path = 'box_labels/%s/%s.txt' % (set_id, image_id)
    # The context manager guarantees the file is closed on every path,
    # including the early return and any parsing error.
    with open(label_path, 'w') as out_file:
        cls = line[0].split('/')[1]
        if cls not in classes:
            return
        cls_id = classes.index(cls)

        b = (float(line[1]), float(line[2]), float(line[3]), float(line[4]))
        bb = convert(size, b)
        out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')

# Build train.txt (absolute image paths, one per line) and the per-image
# darknet label files for every annotation set.
wd = getcwd()
with open('train.txt', 'w') as list_file:
    for set_id in sets:
        label_dir = 'box_labels/%s' % (set_id)
        if not os.path.exists(label_dir):
            os.makedirs(label_dir)
        # Read the annotation records up front so the input file is closed
        # before any label file is written.
        with open('box_imgs/%s.txt' % (set_id)) as info_file:
            images_info = info_file.readlines()
        for line in images_info:
            line = line.strip()
            if not line:
                # Blank lines carry no record and would otherwise raise
                # IndexError below.
                continue
            line = line.split(',')
            # Third '/'-separated path component is the file name; keep the
            # part before the first dot as the image id.
            image_id = line[0].split('/')[2].split('.')[0]
            list_file.write('%s/box_imgs/%s/%s.jpg\n' % (wd, set_id, image_id))
            convert_annotation(line, image_id, set_id)

执行这个脚本会生成一个box_labels文件夹，并在里面生成对应的darknet标签信息。同时生成一个train.txt说明用于训练的图片的路径。

更改YOLO模型参数

对cfg/yolo.cfg的配置文件进行调整

subdivisions=2 # 原来为64，改小可以提高训练速度，同时增加显存使用
output= 735 # 最后connected层，原来值为1470
classes = 5 # detection层，这里设置为5

这个参数由公式 output = S x S x (5*B+C) 确定，其中S=7，B=2，C=5（5是我实验使用的类别数），即 7 x 7 x (5*2+5) = 735。

改写src/data.c中的fill_truth_region函数，确定图像和标签数据的加载位置

//    char *labelpath = find_replace(path, "images", "labels");
    char *labelpath = find_replace(path, "box_imgs", "box_labels");

改写src/yolo.c：

//char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};
//image voc_labels[20];
char *voc_names[] = {"men_jacket", "men_bottem", "underwear", "women_bottem", "women_jacket"};
image voc_labels[5];
void train_yolo(char *cfgfile, char *weightfile)
{
//    char *train_images = "/data/voc/train.txt";
//    char *backup_directory = "/home/pjreddie/backup/";
    char *train_images = "/home/xixi/darknet/train.txt";
    char *backup_directory = "/home/xixi/darknet/results/";

...

void run_yolo(int argc, char **argv)
{
    /*
    int i;
    for(i = 0; i < 20; ++i){
        char buff[256];
        sprintf(buff, "data/labels/%s.png", voc_names[i]);
        voc_labels[i] = load_image_color(buff, 0, 0);
    }
    */

改好之后重新编译代码，下载预训练好的模型参数文件，并开始训练

make
wget http://pjreddie.com/media/files/extraction.conv.weights
./darknet -i 2 yolo train cfg/yolo.cfg extraction.conv.weights

在results文件夹下面就会生成训练好的模型文件，当迭代40000次后，会输出最终模型文件yolo_final.weights。这个过程用GPU大概需要3天时间，消耗显存6个G。（-i参数用于指定使用的GPU编号，编号从0开始，-i 2表示使用编号为2的那块GPU）

YOLO检测

# 检测单张图片
./darknet yolo test cfg/yolo.cfg results/yolo_final.weights <image>
# 检测多张图片
./darknet yolo test cfg/yolo.cfg results/yolo_final.weights
# 改变检测的阈值
./darknet yolo test cfg/yolo.cfg results/yolo_final.weights <image> -thresh 0
# 改为CPU模式检测
./darknet -nogpu yolo test cfg/yolo.cfg results/yolo_final.weights <image>

经过测试，在GPU环境下，测试一张图片跑完整的YOLO模型需要40ms~100ms，CPU环境下需要5秒以上。

补充

YOLO官方后来又出了一个yolo2.cfg，研究了一下，每一卷积层都加了batch_normalization，提高训练速度，破费。同时把最后的一个全连接层改成了一个local层。

讨论和思考

YOLO的核心思想就是利用整张图作为网络的输入，直接在输出层回归bounding box的位置和bounding box所属的类别。
将一幅图像经过类似alexnet的特征提取，最后经过一层的全连接层之后又映射回SxS个网格(grid cell)，如果某个object的中心落在这个网格中，则这个网格就负责预测这个object。
每个网格要预测B个bounding box，每个bounding box除了要回归自身的位置之外，还要附带预测一个confidence值。这个confidence由所预测的box中含有object的置信度（object落在其中，取1，否则取0）和这个box与truth的IOU的乘积确定。这样每个bounding box要返回(x, y, w, h)和confidence共5个值，每个网格还要预测object的C个类别的信息，所以最后一层的输出应该为S x S x (5*B+C)。
由于xywh取的是相对坐标，归一化为0~1，confidence取值范围为0~1，category的类别取值为0或1，简单实现了归一化的目的，然而由于目标函数使用了均方误差损失函数，位置坐标和类别判别信息维度对损失函数的贡献不应该是一样的，所以文章中采用了如下的方法改进

更重视坐标预测，给这些损失前面赋予更大的loss weight, 训练中取5。
对没有object的box的confidence loss，赋予小的loss weight，训练中取0.5。
有object的box的confidence loss和类别的loss的loss weight正常取1。

小box的位置偏差对实际效果的影响更为严重，但均方误差对这一点没有体现，所以对w和h参数又取了平方根处理，增大了小box的位置偏移对loss函数的影响。
一个网格有多个bounding box的时候，根据IOU值取前B个进行处理。所以对多个物体相互靠近的时候处理的并不是十分理想。
在test一张图片的时候，通过网络得到SxSx(5*B+C)层的值后，每个网格的类别信息Pr1，其对应的bounding box的confidence Pr2以及box和truth的IOU信息的乘积用来预测box属于某一类的概率。根据阈值过滤后，对保留的boxes进行非极大值抑制NMS处理，得到最终结果。

杨现的个人博客

分享计算机视觉、算法、生活累积的点滴