The model is a Paddle port of a PyTorch model for image-to-text generation (image captioning); the loss is CrossEntropyLoss and the optimizer is Adam.
The torch and paddle models have essentially identical structures. Running both side by side, the first-round loss values are close, but after that the torch model's loss drops right away while the paddle one just keeps bouncing up and down. Please take a look, pretty please!
Below is the model training process.
The following code is the generator (Generator):
import paddle
import paddle.nn as nn


class Generator(nn.Layer):
    def __init__(self,
                 attention_dim,
                 embedding_dim,
                 gru_units,
                 vocab_size,
                 encoder_dim=2048,
                 dropout=0.5):
        super(Generator, self).__init__()
        self.encoder_dim = encoder_dim
        self.attention_dim = attention_dim
        self.embedding_dim = embedding_dim
        self.gru_units = gru_units
        self.vocab_size = vocab_size
        # Attention is defined elsewhere in the project.
        self.attention_net = Attention(encoder_dim, gru_units, attention_dim)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.gru = nn.GRUCell(embedding_dim + encoder_dim, gru_units)  # bias=True was changed during the port
        self.init_h = nn.Linear(encoder_dim, gru_units)
        self.f_beta = nn.Linear(gru_units, encoder_dim)
        self.sigmoid = nn.Sigmoid()
        self.fc = nn.Linear(gru_units, vocab_size)
        self.softmax = nn.Softmax(axis=1)
        self.relu = nn.ReLU()

    def init_hidden_state(self, img_feats):
        # Initialize the GRU hidden state from the mean image feature.
        mean_img_feats = img_feats.mean(axis=1)
        hidden = self.init_h(mean_img_feats)
        hidden = self.relu(hidden)
        return hidden

    def forward(self, img_feats, caps, cap_lens):
        batch_size = img_feats.shape[0]
        vocab_size = self.vocab_size
        num_pixels = img_feats.shape[1]
        # Sort the batch by caption length, descending. Use index_select
        # instead of a .numpy() round trip so img_feats and caps stay in
        # the autograd graph (converting to numpy detaches them).
        indices = paddle.argsort(cap_lens, axis=0, descending=True)
        cap_lens = cap_lens.sort(axis=0, descending=True)
        img_feats = paddle.index_select(img_feats, indices, axis=0)
        caps = paddle.index_select(caps, indices, axis=0).astype('int64')
        embeddings = self.embedding(caps)
        hidden_state = self.init_hidden_state(img_feats)
        output_lens = (cap_lens - 1).numpy().tolist()
        preds = paddle.zeros((batch_size, caps.shape[1] - 1, vocab_size))
        alphas = paddle.zeros((batch_size, caps.shape[1] - 1, num_pixels))
        for t in range(max(output_lens)):
            context_vec, alpha = self.attention_net(img_feats, hidden_state)
            gate = self.sigmoid(self.f_beta(hidden_state))
            context_vec = gate * context_vec
            # paddle.nn.GRUCell returns (y, h); for GRUCell both are the
            # new hidden state.
            _, hidden_state = self.gru(
                paddle.concat([embeddings[:, t], context_vec], axis=1),
                hidden_state)
            preds[:, t] = self.fc(self.dropout(hidden_state))
            alphas[:, t] = alpha
        return preds, caps, output_lens, alphas, indices.numpy().tolist()

    def step(self, input_word, hidden_state, img_feats):
        # One decoding step; used when training the discriminator to
        # generate predictions from the image features and the 'start' token.
        embeddings = self.embedding(input_word)
        context_vec, alpha = self.attention_net(img_feats, hidden_state)
        gate = self.sigmoid(self.f_beta(hidden_state))
        context_vec = gate * context_vec
        _, hidden_state = self.gru(
            paddle.concat([embeddings, context_vec], axis=1), hidden_state)
        preds = self.softmax(self.fc(hidden_state))
        return preds, hidden_state

    def sample(self, cap_len, col_shape, img_feats, input_word,
               sampling_method='multinomial', hidden_state=None):
        samples = paddle.zeros([input_word.shape[0], col_shape])
        if hidden_state is None:
            hidden_states = paddle.zeros(
                [input_word.shape[0], col_shape, self.gru_units])
            hidden_state = self.init_hidden_state(img_feats)
            samples[:, 0] = input_word
            for i in range(cap_len):
                preds, hidden_state = self.step(input_word, hidden_state, img_feats)
                if sampling_method == 'multinomial':
                    # Draw one token per row from the predicted distribution.
                    input_word = paddle.multinomial(preds, 1)
                    input_word = input_word.squeeze(-1)
                else:
                    input_word = paddle.argmax(preds, 1)
                samples[:, i + 1] = input_word
                hidden_states[:, i] = hidden_state
            return samples, hidden_states
        else:
            for i in range(cap_len):
                preds, hidden_state = self.step(input_word, hidden_state, img_feats)
                if sampling_method == 'multinomial':
                    input_word = paddle.multinomial(preds, 1)
                    input_word = input_word.squeeze(-1)
                else:
                    input_word = paddle.argmax(preds, 1)
                samples[:, i] = input_word
            return samples
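One more thing worth checking, offered as an assumption rather than a confirmed diagnosis: some Paddle versions do not propagate gradients through in-place slice assignment, so the writes preds[:, t] = ... inside the decoding loop could silently detach every step's output from the loss, which would match "first-round loss looks fine but never decreases". A minimal sketch of the safer list-append + paddle.stack pattern for the loop body in forward():

    # Collect per-step outputs in Python lists, then stack once.
    step_preds, step_alphas = [], []
    for t in range(max(output_lens)):
        context_vec, alpha = self.attention_net(img_feats, hidden_state)
        gate = self.sigmoid(self.f_beta(hidden_state))
        context_vec = gate * context_vec
        _, hidden_state = self.gru(
            paddle.concat([embeddings[:, t], context_vec], axis=1),
            hidden_state)
        step_preds.append(self.fc(self.dropout(hidden_state)))
        step_alphas.append(alpha)
    preds = paddle.stack(step_preds, axis=1)    # [batch, max_len, vocab_size]
    alphas = paddle.stack(step_alphas, axis=1)  # [batch, max_len, num_pixels]

Note the stacked tensors have width max(output_lens) instead of caps.shape[1] - 1; if downstream code relies on the latter, pad the last axis accordingly.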
Try a smaller learning rate; it can work wonders~~
Since the cross-entropy loss function applies Softmax internally by default, you need to remove the extra Softmax from the network definition; otherwise the Softmax in the network stacks with the one inside the loss (a double Softmax), which can prevent convergence or make it very slow.
See the CrossEntropyLoss documentation: https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/nn/CrossEntropyLoss_cn.html#crossentropyloss
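For example, a minimal sketch of what the loss call should look like once the extra Softmax is gone (shapes are illustrative):

    import paddle
    import paddle.nn as nn

    logits = paddle.randn([4, 10])       # raw fc output, [batch, vocab_size]
    labels = paddle.randint(0, 10, [4])  # int64 class indices
    loss_fn = nn.CrossEntropyLoss()      # applies softmax internally by default
    loss = loss_fn(logits, labels)       # pass logits, not softmax(logits)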
I removed the softmax and changed the cross-entropy loss accordingly, but it still doesn't work...
Did reducing the learning rate not help? I hit the same problem when converting the classic CV model code from Dive into Deep Learning (《动手学深度学习》) from pytorch to paddle; dropping the learning rate to 1/10 made it trainable.
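For reference, a minimal sketch of dropping the learning rate on the Paddle side (the layer sizes and the original 4e-4 rate are assumed values, not taken from this thread):

    import paddle

    # Assumed hyper-parameters, for illustration only; Generator is the
    # class defined above.
    generator = Generator(attention_dim=512, embedding_dim=512,
                          gru_units=512, vocab_size=10000)
    optimizer = paddle.optimizer.Adam(
        learning_rate=4e-5,  # 1/10 of the assumed torch lr of 4e-4
        parameters=generator.parameters())

Also note paddle.optimizer.Adam takes the parameter list via the parameters= keyword, unlike torch.optim.Adam where it is the first positional argument.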
Maybe the weight initialization schemes are not quite the same between the two frameworks?
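One way to rule that out is to copy the torch weights into the paddle model once at startup. A hedged sketch, assuming the torch state_dict was exported to a hypothetical torch_generator.npz with parameter names that happen to line up (they often do for Linear/Embedding/GRUCell, but verify on your model); remember torch stores Linear weights as [out, in] while paddle stores [in, out]:

    import numpy as np
    import paddle

    torch_weights = np.load('torch_generator.npz')  # hypothetical export
    state = generator.state_dict()
    for name in state:
        w = torch_weights[name]
        # Transpose 2-D Linear weights; embeddings and GRU weights keep
        # their layout.
        if name.endswith('.weight') and w.ndim == 2 and 'embedding' not in name:
            w = w.T
        state[name] = paddle.to_tensor(w)
    generator.set_state_dict(state)

If the paddle loss then tracks the torch run, initialization (or a weight-layout mismatch) was the culprit.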
I ported some graph-convolution code before too, and in the end the paddle version's accuracy always came out about 1% lower than the pytorch version......