Dynamic-graph ResNet won't converge on classification
Background:
I'm porting code from PyTorch to Paddle and am not yet very fluent with it.
Platform:
1) AI Studio, GPU
2) PaddlePaddle 1.7
Problem:
The PyTorch version of this ResNet handles MNIST classification fine, but after porting it to a Paddle dynamic-graph version, training does not converge. The PyTorch version converges to a loss of around 0.00x, while the Paddle version keeps hovering around 4-5.
I tried lowering the learning rate all the way to 1e-6, but the loss still just oscillates. I also pushed a Tensor through the network to test each layer's output shape, and everything matches expectations (a sketch of that check follows).
I implemented VGG, GoogLeNet, ResNet, and DenseNet this way, and only VGG converges. I've spent a day or two on this and checked the code n times, but I still can't find the problem. Can someone more experienced help me out? It's urgent!
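For reference, here is roughly what that shape check looks like (a minimal sketch; it assumes the ResNet class defined under "Code:" below):

import numpy as np
from paddle.fluid import dygraph

with dygraph.guard():
    net = ResNet(in_channels=1, num_classes=10, resize_shape=(96, 96))
    x = dygraph.to_variable(np.zeros((1, 1, 28, 28), dtype="float32"))
    x = net.resize_block(x)  # expect [1, 1, 96, 96]
    for block in [net.in_block, net.resnet_block1, net.resnet_block2,
                  net.resnet_block3, net.resnet_block4,
                  net.global_avg_pool, net.fc]:
        x = block(x)
        print(x.shape)  # should end at [1, 10] after the fc block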
Code:
import numpy as np
import paddle
from paddle import fluid
from paddle.fluid import dygraph, layers
class Sequential(dygraph.Layer):
    def __init__(self, layers: list, debug=False):
        super(Sequential, self).__init__()
        self.debug = debug
        # sub-layers are kept in a plain Python list
        # (this turns out to be the culprit; see the resolution below)
        self.layers = layers

    def __getitem__(self, index):
        return self.layers[index]

    def append(self, elem):
        self.layers.append(elem)

    def __len__(self):
        return len(self.layers)

    def forward(self, x):
        if self.debug:
            print(len(self))
        y = x
        for i in range(len(self)):
            y = self[i](y)
            if self.debug:
                print(i, y.shape)
        return y
class Residual(dygraph.Layer):
    def __init__(self, in_channels, out_channels, use_1x1_conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = dygraph.Conv2D(in_channels, out_channels, filter_size=3, padding=1, stride=stride)
        self.conv2 = dygraph.Conv2D(out_channels, out_channels, filter_size=3, padding=1)
        if use_1x1_conv:
            # 1x1 conv matches channels/stride on the shortcut path
            self.conv3 = dygraph.Conv2D(in_channels, out_channels, filter_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = dygraph.BatchNorm(out_channels)
        self.bn2 = dygraph.BatchNorm(out_channels)

    def forward(self, x):
        y = layers.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        if self.conv3 is not None:
            x = self.conv3(x)
        return layers.relu(y + x)
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels
    blk = list()
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1_conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return Sequential(blk)
class ResNet(dygraph.Layer):
    def __init__(self, in_channels, num_classes, resize_shape):
        super(ResNet, self).__init__()
        # upsample the 28x28 inputs before the 7x7 stem conv
        self.resize_block = lambda x: layers.resize_bilinear(x, out_shape=resize_shape)
        self.in_block = Sequential([
            dygraph.Conv2D(in_channels, 64, filter_size=7, stride=2, padding=3),
            dygraph.BatchNorm(64),
            layers.relu,
            dygraph.Pool2D(pool_size=3, pool_stride=2, pool_padding=1)
        ])
        self.resnet_block1 = resnet_block(64, 64, 2, first_block=True)
        self.resnet_block2 = resnet_block(64, 128, 2)
        self.resnet_block3 = resnet_block(128, 256, 2)
        self.resnet_block4 = resnet_block(256, 512, 2)
        self.global_avg_pool = dygraph.Pool2D(pool_type="avg", global_pooling=True)
        self.fc = Sequential([
            lambda x: layers.reshape(x, [x.shape[0], -1]),
            dygraph.Linear(512, num_classes,
                           param_attr=fluid.ParamAttr(initializer=fluid.initializer.MSRAInitializer()),
                           act="softmax")
        ])
        self.net = Sequential([
            self.resize_block,
            self.in_block,
            self.resnet_block1,
            self.resnet_block2,
            self.resnet_block3,
            self.resnet_block4,
            self.global_avg_pool,
            self.fc
        ])

    def forward(self, x):
        return self.net(x)
with fluid.dygraph.guard(fluid.CUDAPlace(0)):
    epoch_num = 20
    BATCH_SIZE = 128
    target_size = (96, 96)
    net = ResNet(in_channels=1, num_classes=10, resize_shape=target_size)
    adam = fluid.optimizer.AdamOptimizer(learning_rate=1e-2, parameter_list=net.parameters())
    train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=BATCH_SIZE, drop_last=True)
    np.set_printoptions(precision=3, suppress=True)
    for epoch in range(epoch_num):
        for batch_id, data in enumerate(train_reader()):
            dy_x_data = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32')
            y_data = np.array([x[1] for x in data]).astype('int64').reshape(BATCH_SIZE, 1)
            img = fluid.dygraph.to_variable(dy_x_data)
            label = fluid.dygraph.to_variable(y_data)
            label.stop_gradient = True
            cost = net(img)
            # cross_entropy here expects probabilities, which the softmax
            # activation on the final Linear layer provides
            loss = fluid.layers.cross_entropy(cost, label)
            avg_loss = fluid.layers.mean(loss)
            avg_loss.backward()
            adam.minimize(avg_loss)
            net.clear_gradients()
            if batch_id % 20 == 0:
                print("epoch: {}, batch: {}, loss: {}".format(epoch, batch_id, avg_loss.numpy()))
    print("Final loss: {}".format(avg_loss.numpy()))
Try deepening the network a bit; I see all your residual stages use only 2 blocks.
Ugh, I later increased each residual block to 4 conv layers, and it still doesn't converge.
How far apart are the accuracies of VGG and ResNet?
Hi, thanks so much for your patience. After many more experiments on my own, I became convinced the problem had to be in the Sequential class I wrote. I changed that class and now it works. Sharing my revised code for you to look at:
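Roughly like this (a minimal sketch of the fix; the essential change is registering every trainable sub-layer via add_sublayer so the parent dygraph.Layer tracks its parameters, while plain functions like layers.relu carry no parameters and can stay in the list as-is):

class Sequential(dygraph.Layer):
    def __init__(self, layers: list, debug=False):
        super(Sequential, self).__init__()
        self.debug = debug
        self.layer_list = []
        for i, layer in enumerate(layers):
            if isinstance(layer, dygraph.Layer):
                # add_sublayer registers the child with this Layer, so its
                # parameters are returned by net.parameters(); a plain
                # Python list attribute does not do this
                layer = self.add_sublayer("layer_%d" % i, layer)
            self.layer_list.append(layer)

    def __getitem__(self, index):
        return self.layer_list[index]

    def __len__(self):
        return len(self.layer_list)

    def forward(self, x):
        y = x
        for i, layer in enumerate(self.layer_list):
            y = layer(y)
            if self.debug:
                print(i, y.shape)
        return y

(Paddle also ships fluid.dygraph.Sequential, but I'm not sure it accepts plain callables like layers.relu, so I kept my own class.)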
Comparing the two versions, my guess at the cause is this: every layer that holds trainable parameters must be an attribute of (or registered with) some dygraph.Layer subclass; otherwise it never shows up in net.parameters(), so the optimizer has nothing to update. If that's right, it explains why the original Sequential (sub-layers in a plain Python list) can't work while the revised one can.
I haven't found documentation on this yet; if I come across some later I'll @ you in this thread. Thanks again!
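A quick way to check this hypothesis (a sketch): build the net once with the old class and once with the new one, and compare the parameter counts.

with fluid.dygraph.guard():
    net = ResNet(in_channels=1, num_classes=10, resize_shape=(96, 96))
    # with the list-based Sequential this prints far fewer entries,
    # because unregistered sub-layers are invisible to parameters()
    print(len(net.parameters()))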
Nice~
Learned something here.