ResNet in dynamic-graph mode does not converge on classification
Background:
I'm porting code from pytorch to paddle and am not very fluent with it yet.
Platform:
1) AI Studio, GPU
2) paddlepaddle version 1.7
Problem:
The pytorch version of my ResNet code works fine on MNIST classification, but after converting it to the paddle dynamic-graph version, training does not converge. The pytorch version converges to a loss of around 0.00x, while the paddle version keeps hovering around 4 to 5.
I tried reducing the learning rate down to 1e-6, but the loss still just oscillates. I also ran a Tensor through the network and checked each layer's output shape, and they all match expectations (a sketch of such a check appears after the model code below). See the two figures below.
I implemented VGG, GoogLeNet, ResNet, and DenseNet, and only VGG converges. I've spent a day or two on this and checked the code n times, but I still can't find the problem. Can any expert help me out? It's urgent!
Code:
import numpy as np
import paddle
from paddle import fluid
from paddle.fluid import data, dygraph, layers, initializer
class Sequential(dygraph.Layer):
    def __init__(self, layers: list, debug=False):
        super(Sequential, self).__init__()
        self.debug = debug
        self.layers = layers

    def __getitem__(self, index):
        return self.layers[index]

    def append(self, elem):
        self.layers.append(elem)

    def __len__(self):
        return len(self.layers)

    def forward(self, x):
        if self.debug:
            print(len(self))
        y = self[0](x)
        if self.debug:
            print(0, y.shape)
        for i in range(1, len(self)):
            y = self[i](y)
            if self.debug:
                print(i, y.shape)
        return y
class Residual(dygraph.Layer):
    def __init__(self, in_channels, out_channels, use_1x1_conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = dygraph.Conv2D(in_channels, out_channels, filter_size=3, padding=1, stride=stride)
        self.conv2 = dygraph.Conv2D(out_channels, out_channels, filter_size=3, padding=1)
        if use_1x1_conv:
            self.conv3 = dygraph.Conv2D(in_channels, out_channels, filter_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = dygraph.BatchNorm(out_channels)
        self.bn2 = dygraph.BatchNorm(out_channels)

    def forward(self, x):
        y = layers.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        if self.conv3 is not None:
            x = self.conv3(x)
        return layers.relu(y + x)


def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels
    blk = list()
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1_conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return Sequential(blk)


class ResNet(dygraph.Layer):
    def __init__(self, in_channels, num_classes, resize_shape):
        super(ResNet, self).__init__()
        self.resize_block = lambda x: layers.resize_bilinear(x, out_shape=resize_shape)
        self.in_block = Sequential([
            dygraph.Conv2D(in_channels, 64, filter_size=7, stride=2, padding=3),
            dygraph.BatchNorm(64),
            layers.relu,
            dygraph.Pool2D(pool_size=3, pool_stride=2, pool_padding=1)
        ])
        self.resnet_block1 = resnet_block(64, 64, 2, first_block=True)
        self.resnet_block2 = resnet_block(64, 128, 2)
        self.resnet_block3 = resnet_block(128, 256, 2)
        self.resnet_block4 = resnet_block(256, 512, 2)
        self.global_avg_pool = dygraph.Pool2D(pool_type="avg", global_pooling=True)
        self.fc = Sequential([
            lambda x: layers.reshape(x, [x.shape[0], -1]),
            dygraph.Linear(512, num_classes,
                           param_attr=fluid.param_attr.ParamAttr(
                               initializer=fluid.initializer.MSRAInitializer()),
                           act="softmax")
        ])
        self.net = Sequential([
            self.resize_block, self.in_block,
            self.resnet_block1, self.resnet_block2,
            self.resnet_block3, self.resnet_block4,
            self.global_avg_pool, self.fc
        ])

    def forward(self, x):
        return self.net(x)
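The per-layer shape test mentioned above isn't shown in the post; a hypothetical version of it, using the debug flag of the Sequential class defined above to print each block's output shape:

with fluid.dygraph.guard():
    net = ResNet(in_channels=1, num_classes=10, resize_shape=(96, 96))
    net.net.debug = True  # make the top-level Sequential print each block's output shape
    x = fluid.dygraph.to_variable(np.zeros((1, 1, 28, 28), dtype="float32"))
    print(net(x).shape)   # expect [1, 10]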
with fluid.dygraph.guard(fluid.CUDAPlace(0)):
    epoch_num = 20
    BATCH_SIZE = 128
    target_size = (96, 96)
    net = ResNet(in_channels=1, num_classes=10, resize_shape=target_size)
    adam = fluid.optimizer.AdamOptimizer(learning_rate=1e-2, parameter_list=net.parameters())
    train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=BATCH_SIZE, drop_last=True)
    np.set_printoptions(precision=3, suppress=True)
    for epoch in range(epoch_num):
        for batch_id, data in enumerate(train_reader()):
            dy_x_data = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32')
            y_data = np.array([x[1] for x in data]).astype('int64').reshape(BATCH_SIZE, 1)
            img = fluid.dygraph.to_variable(dy_x_data)
            label = fluid.dygraph.to_variable(y_data)
            label.stop_gradient = True
            cost = net(img)
            loss = fluid.layers.cross_entropy(cost, label)
            avg_loss = fluid.layers.mean(loss)
            avg_loss.backward()
            adam.minimize(avg_loss)
            net.clear_gradients()
            if batch_id % 20 == 0:
                print("epoch: {}, batch: {}, loss: {}".format(epoch, batch_id, avg_loss.numpy()))
    print("Final loss: {}".format(avg_loss.numpy()))
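One extra debugging step worth noting here (a hypothetical snippet, not part of the original post): list the parameters the optimizer is actually handed. For a model that refuses to converge, this shows at a glance whether all of its weights are being trained.

with fluid.dygraph.guard(fluid.CUDAPlace(0)):
    net = ResNet(in_channels=1, num_classes=10, resize_shape=(96, 96))
    for p in net.parameters():
        print(p.name, p.shape)  # every Conv2D/BatchNorm/Linear weight should be listed
    print("total parameter tensors:", len(net.parameters()))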
Try making the network deeper; I see each of your residual stages only has 2 blocks.
Sigh, I already bumped each of the later residual blocks up to 4 conv layers, and it still doesn't converge.
How big is the accuracy gap between VGG and ResNet?
Hi, thank you so much for being this patient. After a lot more experimenting, I'm convinced the problem is the Sequential class I wrote. I modified that class and now everything works. Here is the revised code for your reference:
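The revised code itself isn't reproduced above, so here is a minimal sketch of the kind of fix described, assuming the change is to register every parameterized sublayer with add_sublayer so its weights are tracked (the names layer_list and "layer_%d" are illustrative, not from the post):

from paddle.fluid import dygraph

class Sequential(dygraph.Layer):
    def __init__(self, layer_list, debug=False):
        super(Sequential, self).__init__()
        self.debug = debug
        self.layer_list = []
        for i, layer in enumerate(layer_list):
            if isinstance(layer, dygraph.Layer):
                # Registration is what makes this sublayer's parameters
                # visible to net.parameters(), and hence to the optimizer.
                self.add_sublayer("layer_%d" % i, layer)
            # Plain callables (layers.relu, lambdas) hold no parameters
            # and can simply be kept in the list.
            self.layer_list.append(layer)

    def forward(self, x):
        for i, layer in enumerate(self.layer_list):
            x = layer(x)
            if self.debug:
                print(i, x.shape)
        return x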
Comparing the two versions, my guess at the cause is this: every layer that holds trainable parameters must be an attribute of (or registered on) some dygraph.Layer subclass; layers kept only in a plain Python list are never registered, so net.parameters() doesn't include their weights and the optimizer never updates them. If that's right, it explains why the original Sequential doesn't work while the new one does.
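A minimal repro of that rule (a hypothetical example, not from the thread): parameters held by an attribute are tracked, while parameters held only in a plain list are not.

from paddle import fluid
from paddle.fluid import dygraph

class Holder(dygraph.Layer):
    def __init__(self):
        super(Holder, self).__init__()
        self.registered = dygraph.Linear(4, 4)  # attribute: auto-registered
        self.hidden = [dygraph.Linear(4, 4)]    # plain list: never registered

with fluid.dygraph.guard():
    h = Holder()
    # Only the registered Linear's weight and bias show up: 2 tensors, not 4.
    print(len(h.parameters()))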
I haven't found documentation on this yet, though; if I come across some later I'll @ you in this thread. Thanks again!
Nice~
Learned something new here.