猫狗分类如何提高准确率？

项目
数据集
课程
比赛
模型库
活动
论坛
访问飞桨官网
项目
数据集
课程
比赛
模型库
活动
论坛
访问飞桨官网
s skygoodboy 发布于2022-11
各位大大好。
小弟想用官方关于眼部识别的教程改成猫狗分类的代码。用了AlexNet/VGG/GoogLeNet三个模型，但效果都不怎么好，准确率只有60%多，不到70%。我想请教一下，我应该怎么去提高我的准确率呢？或者是不是我的代码上有什么不对呢？烦请指教，谢谢。
import paddle
import numpy as np
from paddle.nn import Conv2D, MaxPool2D, Linear, Dropout, AdaptiveAvgPool2D
import cv2
import os
import paddle.nn.functional as F

import random


class Inception(paddle.nn.Layer):
    """Inception block: four parallel branches concatenated along channels.

    Every branch uses stride 1 with padding matched to its kernel size, so
    the spatial size of the input is preserved; only the channel count
    changes, to ``c1 + c2[1] + c3[1] + c4``.

    Args:
        c0: number of input channels.
        c1: output channels of branch 1 (a single 1x1 convolution).
        c2: indexable pair; ``c2[0]`` is the output width of branch 2's 1x1
            convolution and ``c2[1]`` that of its 3x3 convolution.
        c3: indexable pair; ``c3[0]`` is the output width of branch 3's 1x1
            convolution and ``c3[1]`` that of its 5x5 convolution.
        c4: output channels of branch 4's 1x1 convolution, which follows a
            3x3 max pool.
        **kwargs: accepted but not used.
    """

    def __init__(self, c0, c1, c2, c3, c4, **kwargs):
        super().__init__()
        # Branch 1: 1x1 convolution.
        self.p1_1 = Conv2D(c0, c1, kernel_size=1, stride=1)
        # Branch 2: 1x1 reduction, then 3x3 convolution.
        self.p2_1 = Conv2D(c0, c2[0], kernel_size=1, stride=1)
        self.p2_2 = Conv2D(c2[0], c2[1], kernel_size=3, padding=1, stride=1)
        # Branch 3: 1x1 reduction, then 5x5 convolution.
        self.p3_1 = Conv2D(c0, c3[0], kernel_size=1, stride=1)
        self.p3_2 = Conv2D(c3[0], c3[1], kernel_size=5, padding=2, stride=1)
        # Branch 4: 3x3 max pool, then 1x1 convolution.
        self.p4_1 = MaxPool2D(kernel_size=3, stride=1, padding=1)
        self.p4_2 = Conv2D(c0, c4, kernel_size=1, stride=1)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate them on axis 1."""
        branch1 = F.relu(self.p1_1(x))

        reduced_for_3x3 = F.relu(self.p2_1(x))
        branch2 = F.relu(self.p2_2(reduced_for_3x3))

        reduced_for_5x5 = F.relu(self.p3_1(x))
        branch3 = F.relu(self.p3_2(reduced_for_5x5))

        pooled = self.p4_1(x)
        branch4 = F.relu(self.p4_2(pooled))

        return paddle.concat([branch1, branch2, branch3, branch4], axis=1)


class GoogLeNet(paddle.nn.Layer):
    """GoogLeNet-style classifier built from five stages of conv/Inception.

    Args:
        num_classes: width of the final linear layer. Defaults to 2, the
            value that used to be hard-coded. Pass 1 to get a single logit,
            which is what a ``binary_cross_entropy_with_logits`` loss with
            ``[N, 1]`` labels expects.

    ``forward`` returns raw scores of shape ``[N, num_classes]`` (no softmax
    or sigmoid is applied).
    """

    def __init__(self, num_classes=2):
        super(GoogLeNet, self).__init__()
        self.num_classes = num_classes
        # Stage 1: one 7x7 convolution followed by a 3x3 max pool.
        self.conv1 = Conv2D(in_channels=3, out_channels=64,
                            kernel_size=7, padding=3, stride=1)
        self.pool1 = MaxPool2D(kernel_size=3, stride=2, padding=1)
        # Stage 2: a 1x1 and a 3x3 convolution followed by a 3x3 max pool.
        self.conv2_1 = Conv2D(
            in_channels=64, out_channels=64, kernel_size=1, stride=1)
        self.conv2_2 = Conv2D(in_channels=64, out_channels=192,
                              kernel_size=3, padding=1, stride=1)
        self.pool2 = MaxPool2D(kernel_size=3, stride=2, padding=1)
        # Stage 3: two Inception blocks (192 -> 256 -> 480 channels).
        self.block3_1 = Inception(192, 64, (96, 128), (16, 32), 32)
        self.block3_2 = Inception(256, 128, (128, 192), (32, 96), 64)
        self.pool3 = MaxPool2D(kernel_size=3, stride=2, padding=1)
        # Stage 4: five Inception blocks (480 -> ... -> 832 channels).
        self.block4_1 = Inception(480, 192, (96, 208), (16, 48), 64)
        self.block4_2 = Inception(512, 160, (112, 224), (24, 64), 64)
        self.block4_3 = Inception(512, 128, (128, 256), (24, 64), 64)
        self.block4_4 = Inception(512, 112, (144, 288), (32, 64), 64)
        self.block4_5 = Inception(528, 256, (160, 320), (32, 128), 128)
        self.pool4 = MaxPool2D(kernel_size=3, stride=2, padding=1)
        # Stage 5: two Inception blocks (832 -> 832 -> 1024 channels).
        self.block5_1 = Inception(832, 256, (160, 320), (32, 128), 128)
        self.block5_2 = Inception(832, 384, (192, 384), (48, 128), 128)
        # Global average pooling to 1x1, so no stride has to be configured.
        self.pool5 = AdaptiveAvgPool2D(output_size=1)
        self.fc = Linear(in_features=1024, out_features=num_classes)

    def forward(self, x):
        """Return class scores of shape ``[N, num_classes]`` for images ``x``."""
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2_2(F.relu(self.conv2_1(x)))))
        x = self.pool3(self.block3_2(self.block3_1(x)))
        x = self.block4_3(self.block4_2(self.block4_1(x)))
        x = self.pool4(self.block4_5(self.block4_4(x)))
        x = self.pool5(self.block5_2(self.block5_1(x)))
        # Flatten [N, 1024, 1, 1] to [N, 1024] for the linear head.
        x = paddle.reshape(x, [x.shape[0], -1])
        x = self.fc(x)
        return x


class VGGNet(paddle.nn.Layer):
    """VGG-style CNN: five conv blocks, each followed by a 2x2 max pool,
    then three fully connected layers.

    Args:
        num_classes: width of the final linear layer. It is honoured here;
            previously it was ignored and the head was fixed at 2 outputs,
            which does not match a ``binary_cross_entropy_with_logits`` loss
            on ``[N, 1]`` labels. The default of 1 yields a single logit.

    fc1 expects 512 * 7 * 7 flattened features, which is what the five
    conv/pool blocks produce for 224x224 inputs.
    """

    def __init__(self, num_classes=1):
        super(VGGNet, self).__init__()
        self.num_classes = num_classes

        self.conv1_1 = Conv2D(in_channels=3, out_channels=64,
                              kernel_size=3, stride=1, padding=1)
        self.conv1_2 = Conv2D(in_channels=64, out_channels=64,
                              kernel_size=3, stride=1, padding=1)
        self.max_pool1 = MaxPool2D(kernel_size=2, stride=2)
        self.conv2_1 = Conv2D(in_channels=64, out_channels=128,
                              kernel_size=3, stride=1, padding=1)
        self.conv2_2 = Conv2D(in_channels=128, out_channels=128,
                              kernel_size=3, stride=1, padding=1)
        # NOTE(review): conv2_3 is never called in forward(). It is kept so
        # parameter names and creation order stay unchanged.
        self.conv2_3 = Conv2D(in_channels=128, out_channels=128,
                              kernel_size=3, stride=1, padding=1)
        self.max_pool2 = MaxPool2D(kernel_size=2, stride=2)
        self.conv3_1 = Conv2D(in_channels=128, out_channels=256,
                              kernel_size=3, stride=1, padding=1)
        self.conv3_2 = Conv2D(in_channels=256, out_channels=256,
                              kernel_size=3, stride=1, padding=1)
        self.conv3_3 = Conv2D(in_channels=256, out_channels=256,
                              kernel_size=3, stride=1, padding=1)
        self.max_pool3 = MaxPool2D(kernel_size=2, stride=2)
        self.conv4_1 = Conv2D(in_channels=256, out_channels=512,
                              kernel_size=3, stride=1, padding=1)
        self.conv4_2 = Conv2D(in_channels=512, out_channels=512,
                              kernel_size=3, stride=1, padding=1)
        self.conv4_3 = Conv2D(in_channels=512, out_channels=512,
                              kernel_size=3, stride=1, padding=1)
        self.max_pool4 = MaxPool2D(kernel_size=2, stride=2)
        self.conv5_1 = Conv2D(in_channels=512, out_channels=512,
                              kernel_size=3, stride=1, padding=1)
        self.conv5_2 = Conv2D(in_channels=512, out_channels=512,
                              kernel_size=3, stride=1, padding=1)
        self.conv5_3 = Conv2D(in_channels=512, out_channels=512,
                              kernel_size=3, stride=1, padding=1)
        # fc1/fc2 bundle the linear layer with its ReLU in a Sequential.
        # For 224x224 inputs the conv stack ends at [512, 7, 7].
        self.fc1 = paddle.nn.Sequential(
            paddle.nn.Linear(512 * 7 * 7, 4096), paddle.nn.ReLU())
        self.drop1_ratio = 0.5
        self.dropout1 = paddle.nn.Dropout(
            self.drop1_ratio, mode='upscale_in_train')
        self.fc2 = paddle.nn.Sequential(
            paddle.nn.Linear(4096, 4096), paddle.nn.ReLU())

        self.drop2_ratio = 0.5
        self.dropout2 = paddle.nn.Dropout(
            self.drop2_ratio, mode='upscale_in_train')
        self.fc3 = paddle.nn.Linear(4096, num_classes)

        self.relu = paddle.nn.ReLU()
        # forward() uses this single shared pool after every conv block;
        # max_pool1..max_pool4 above are not called.
        self.pool = MaxPool2D(stride=2, kernel_size=2)

    def forward(self, x, label=None):
        """Compute class scores, and top-1 accuracy when ``label`` is given.

        Args:
            x: batch of images.
            label: optional labels; they are cast to int64 before being
                passed to ``paddle.metric.accuracy``.

        Returns:
            The raw scores of shape ``[N, num_classes]`` if ``label`` is
            None, otherwise the tuple ``(scores, accuracy)``.
        """
        x = self.relu(self.conv1_1(x))
        x = self.relu(self.conv1_2(x))
        x = self.pool(x)

        x = self.relu(self.conv2_1(x))
        x = self.relu(self.conv2_2(x))
        x = self.pool(x)

        x = self.relu(self.conv3_1(x))
        x = self.relu(self.conv3_2(x))
        x = self.relu(self.conv3_3(x))
        x = self.pool(x)

        x = self.relu(self.conv4_1(x))
        x = self.relu(self.conv4_2(x))
        x = self.relu(self.conv4_3(x))
        x = self.pool(x)

        x = self.relu(self.conv5_1(x))
        x = self.relu(self.conv5_2(x))
        x = self.relu(self.conv5_3(x))
        x = self.pool(x)

        x = paddle.flatten(x, 1, -1)
        # fc1/fc2 already end with a ReLU, so no extra activation is needed.
        x = self.dropout1(self.fc1(x))
        x = self.dropout2(self.fc2(x))
        x = self.fc3(x)
        if label is None:
            return x

        if self.num_classes == 1:
            # Top-1 accuracy is meaningless on one column, so expand the
            # single logit into [P(class 0), P(class 1)] first.
            prob_pos = F.sigmoid(x)
            scores = paddle.concat([1.0 - prob_pos, prob_pos], axis=1)
        else:
            scores = x
        acc = paddle.metric.accuracy(
            input=scores, label=paddle.cast(label, dtype='int64'))
        return x, acc


class AlexNet(paddle.nn.Layer):
    """AlexNet-style CNN: five convolutions, three max pools, three FC layers.

    Args:
        num_classes: width of the final linear layer (default 1, i.e. a
            single logit).

    fc1 expects 12544 = 256 * 7 * 7 flattened features, which is what the
    convolution stack produces for 224x224 inputs. ``forward`` returns raw
    scores; no sigmoid is applied.
    """

    def __init__(self, num_classes=1):
        super().__init__()
        self.num_classes = num_classes

        # Feature extractor.
        self.conv1 = Conv2D(3, 96, kernel_size=11, stride=4, padding=5)
        self.max_pool1 = MaxPool2D(kernel_size=2, stride=2)
        self.conv2 = Conv2D(96, 256, kernel_size=5, stride=1, padding=2)
        self.max_pool2 = MaxPool2D(kernel_size=2, stride=2)
        self.conv3 = Conv2D(256, 384, kernel_size=3, stride=1, padding=1)
        self.conv4 = Conv2D(384, 384, kernel_size=3, stride=1, padding=1)
        self.conv5 = Conv2D(384, 256, kernel_size=3, stride=1, padding=1)
        self.max_pool3 = MaxPool2D(kernel_size=2, stride=2)

        # Classifier head with dropout after each hidden layer.
        self.fc1 = Linear(in_features=12544, out_features=4096)
        self.drop_ratio1 = 0.5
        self.drop1 = Dropout(self.drop_ratio1)
        self.fc2 = Linear(in_features=4096, out_features=4096)
        self.drop_ratio2 = 0.5
        self.drop2 = Dropout(self.drop_ratio2)
        self.fc3 = Linear(in_features=4096, out_features=num_classes)

    def forward(self, x):
        """Return raw scores of shape ``[N, num_classes]`` for images ``x``."""
        x = self.max_pool1(F.relu(self.conv1(x)))
        x = self.max_pool2(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.max_pool3(F.relu(self.conv5(x)))

        # Flatten everything except the batch dimension.
        x = paddle.reshape(x, [x.shape[0], -1])

        x = self.drop1(F.relu(self.fc1(x)))
        x = self.drop2(F.relu(self.fc2(x)))
        return self.fc3(x)


def transform_img(img):
    """Resize an image to 224x224 and convert it to a float32 CHW array.

    The image is resized with ``cv2.resize`` (aspect ratio is not
    preserved), its axes are reordered from (H, W, C) to (C, H, W), and the
    values are divided by 255 and then mapped linearly with ``v * 2 - 1``.
    """
    resized = cv2.resize(img, (224, 224))
    channels_first = np.transpose(resized, (2, 0, 1)).astype('float32')
    return channels_first / 255 * 2.0 - 1.0


def train_data_loader(bacth_size=50):
    """Yield shuffled (images, labels) mini-batches from ``./Data/train/``.

    Every file under the directory whose name contains "jpg" is used. The
    label comes from the file path: 0 if it contains "cat", otherwise 1 if
    it contains "dog".

    Args:
        bacth_size: number of samples per batch. The misspelled name is
            kept so existing keyword callers keep working.

    Yields:
        Tuples ``(imgs, labels)``: ``imgs`` is a float32 array stacking the
        ``transform_img`` output of each image, ``labels`` is a float32
        array of shape ``(n, 1)``. The last batch may hold fewer than
        ``bacth_size`` samples; it uses the same dtypes as full batches.

    Raises:
        ValueError: if a path contains neither "cat" nor "dog", or if an
            image file cannot be decoded.
    """
    train_file_path = "./Data/train/"
    imgs_file_list = []
    for dirpath, _dirnames, filenames in os.walk(train_file_path):
        for filename in filenames:
            if "jpg" in filename:
                imgs_file_list.append(os.path.join(dirpath, filename))
    random.shuffle(imgs_file_list)

    batch_imgs = []
    batch_labels = []
    for one_file in imgs_file_list:
        # Resolve the label before decoding so a bad path fails fast.
        if "cat" in one_file:
            label = 0
        elif "dog" in one_file:
            label = 1
        else:
            raise ValueError("invalid file path: {}".format(one_file))

        img = cv2.imread(one_file)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            raise ValueError("cannot read image: {}".format(one_file))

        batch_imgs.append(transform_img(img))
        batch_labels.append(label)
        if len(batch_imgs) == bacth_size:
            yield (np.array(batch_imgs).astype('float32'),
                   np.array(batch_labels).astype('float32').reshape(-1, 1))
            batch_imgs = []
            batch_labels = []

    # Flush the trailing partial batch, if any.
    if batch_imgs:
        yield (np.array(batch_imgs).astype('float32'),
               np.array(batch_labels).astype('float32').reshape(-1, 1))


def valid_data_loader(bacth_size=10):
    """Yield shuffled (images, labels) mini-batches from ``./Data/validation/``.

    Every file under the directory whose name contains "jpg" is used. The
    label comes from the file path: 0 if it contains "cat", otherwise 1 if
    it contains "dog".

    Args:
        bacth_size: number of samples per batch. The misspelled name is
            kept so existing keyword callers keep working.

    Yields:
        Tuples ``(imgs, labels)``: ``imgs`` is a float32 array stacking the
        ``transform_img`` output of each image, ``labels`` is a float32
        array of shape ``(n, 1)`` for any batch size. The last batch may
        hold fewer than ``bacth_size`` samples.

    Raises:
        ValueError: if a path contains neither "cat" nor "dog", or if an
            image file cannot be decoded.
    """
    valid_file_path = "./Data/validation/"
    imgs_file_list = []
    for dirpath, _dirnames, filenames in os.walk(valid_file_path):
        for filename in filenames:
            if "jpg" in filename:
                imgs_file_list.append(os.path.join(dirpath, filename))
    random.shuffle(imgs_file_list)

    batch_imgs = []
    batch_labels = []
    for one_file in imgs_file_list:
        # Resolve the label before decoding so a bad path fails fast.
        if "cat" in one_file:
            label = 0
        elif "dog" in one_file:
            label = 1
        else:
            raise ValueError("invalid file path: {}".format(one_file))

        img = cv2.imread(one_file)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            raise ValueError("cannot read image: {}".format(one_file))

        batch_imgs.append(transform_img(img))
        batch_labels.append(label)
        if len(batch_imgs) == bacth_size:
            # reshape(-1, 1) instead of a fixed 10 rows, so the label shape
            # follows bacth_size.
            yield (np.array(batch_imgs).astype('float32'),
                   np.array(batch_labels).astype('float32').reshape(-1, 1))
            batch_imgs = []
            batch_labels = []

    # Flush the trailing partial batch, if any.
    if batch_imgs:
        yield (np.array(batch_imgs).astype('float32'),
               np.array(batch_labels).astype('float32').reshape(-1, 1))


# One instance of each candidate architecture; only my_alex_net is passed
# to train() further down.
my_alex_net = AlexNet()
my_google_net = GoogLeNet()
my_VGG_net = VGGNet()
# Number of full passes over the training set performed by train().
EPOCH_NUM = 5


def train(model, optimizer):
    """Train ``model`` for EPOCH_NUM epochs, then validate it once.

    Training uses ``binary_cross_entropy_with_logits``, so ``model`` must
    return one logit per sample (shape ``[N, 1]``) to match the ``[N, 1]``
    float labels produced by the data loaders. The validation pass runs a
    single time after the last epoch and prints the sample-weighted
    accuracy and loss.

    Args:
        model: the network to optimise; called as ``model(images)``.
        optimizer: a Paddle optimizer built over ``model``'s parameters.
    """
    # Use the first GPU when this Paddle build supports CUDA, else the CPU.
    use_gpu = paddle.device.is_compiled_with_cuda()
    paddle.device.set_device('gpu:0' if use_gpu else 'cpu')

    print("start training ...")
    for epoch in range(EPOCH_NUM):
        for batch_id, (x_data, y_data) in enumerate(train_data_loader()):
            img = paddle.to_tensor(x_data)
            label = paddle.to_tensor(y_data)
            logits = model(img)
            loss = F.binary_cross_entropy_with_logits(logits, label)
            avg_loss = paddle.mean(loss)
            if batch_id % 20 == 0:
                print("epoch: {}, batch_id: {}, loss is: {:.4f}".format(
                    epoch, batch_id, float(avg_loss.numpy())))
            # Backpropagate, update the weights, then clear the gradients.
            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()

    model.eval()
    correct_sum = 0.0
    loss_sum = 0.0
    sample_count = 0
    # No gradients are needed for evaluation; skipping them saves memory.
    with paddle.no_grad():
        for x_data, y_data in valid_data_loader():
            img = paddle.to_tensor(x_data)
            label = paddle.to_tensor(y_data)
            logits = model(img)
            loss = F.binary_cross_entropy_with_logits(logits, label)
            # Expand the single logit into [P(cat), P(dog)] so that top-1
            # accuracy can be computed against the 0/1 labels.
            prob_dog = F.sigmoid(logits)
            probs = paddle.concat([1.0 - prob_dog, prob_dog], axis=1)
            acc = paddle.metric.accuracy(
                probs, paddle.cast(label, dtype='int64'))
            # Weight by batch size so a short final batch is not
            # over-represented in the averages.
            batch_count = len(y_data)
            correct_sum += float(acc.numpy()) * batch_count
            loss_sum += float(loss.numpy()) * batch_count
            sample_count += batch_count
    model.train()

    if sample_count == 0:
        print("[validation] no validation samples found")
        return
    print(
        "[validation] accuracy/loss: {:.4f}/{:.4f}".format(
            correct_sum / sample_count, loss_sum / sample_count))


# Momentum optimizer with weight decay over the AlexNet parameters.
opt = paddle.optimizer.Momentum(
    learning_rate=0.001,
    momentum=0.9,
    weight_decay=0.001,
    parameters=my_alex_net.parameters(),
)
# Start the training run.
train(my_alex_net, opt)
全部评论(8)
beyondyourself
#2 回复于2022-11
试试resnet
skygoodboy
#3 回复于2022-11
beyondyourself #2
试试resnet
试了，更差了。
[validation] accuracy/loss: 0.5643/0.6964
是不是我的代码的accuracies计算有什么问题？
李长安
#4 回复于2022-12
大佬
3荡仔
#5 回复于2022-12
Linq1呀
#7 回复于2022-12
训练的epoch数量不够吧，你代码里写的EPOCH_NUM是5，同时可能epoch和优化器学习率之类的也不匹配
skygoodboy
#8 回复于2022-12
Linq1呀 #7
训练的epoch数量不够吧，你代码里写的EPOCH_NUM是5，同时可能epoch和优化器学习率之类的也不匹配
好的，谢谢！我提高看看！
skygoodboy
#9 回复于2022-12
skygoodboy #8
好的，谢谢！我提高看看！
提高EPOCH_NUM是不行的。我查看了图片，发现图片尺寸并不统一，所以有可能是因为图片大小不一，我强行转换成3x224x224，会造成失真。
准备看看用填充的方法来试试。
3荡仔
#10 回复于2022-12
提issue
需求/bug反馈？一键提issue告诉我们
提pr
发现bug？如果您知道修复办法，欢迎提pr直接参与建设飞桨~