运行Seq2seq模型时,在参数优化过程报错:RuntimeError: (NotFound)
收藏
代码在执行时前向没有问题,在执行Loss.backward()时报错:RuntimeError: (NotFound) Inputs and outputs of squeeze2_grad do not exist. 代码如下,不太熟练,写得不太标准,希望高人能够指点一下,谢谢!
class Encoder(paddle.nn.Layer):
def __init__(self):
super(Encoder, self).__init__()
self.emb = paddle.nn.Embedding(vocab_size, embedding_size,)
# for single layer lstm encoder model
self.lstm = paddle.nn.LSTM(input_size=embedding_size,
hidden_size=hidden_size,
num_layers=num_encoder_lstm_layers)
def forward(self, x):
x = self.emb(x)
x, states = self.lstm(x)
return x, states
class AttentionLayer(nn.Layer):
def __init__(self, hidden_size):
super(AttentionLayer, self).__init__()
self.linear1 = nn.Linear(hidden_size*2, hidden_size)
self.linear2 = nn.Linear(hidden_size, 1)
def forward(self, hidden, encoder_outputs):
hidden = paddle.unsqueeze(encoder_hidden, axis=1) # hidden.shape=[batch, hidden]
attention_inputs = paddle.concat((encoder_outputs, paddle.tile(hidden, repeat_times=[1, MAX_LEN+1, 1])),
axis=-1
)
tmp_tensor = linear1(attention_inputs)
tmp_tensor = F.tanh(tmp_tensor)
weights = linear2(tmp_tensor)
weights = paddle.squeeze(weights)
attention_weights = F.softmax(weights)
attention_weights = paddle.expand_as(paddle.unsqueeze(attention_weights, -1),
encoder_outputs)
context_vector = paddle.multiply(encoder_outputs, attention_weights)
context_vector = paddle.sum(context_vector, 1)
return context_vector
class DecoderCell(nn.RNNCellBase):
def __init__(self, hidden_size, embedding_size):
super(DecoderCell, self).__init__()
self.lstm_cell = nn.LSTMCell(embedding_size + hidden_size, hidden_size)
self.attention_layer = AttentionLayer(hidden_size)
def forward(self, inputs, initial_states, encoder_outputs):
# 输入的inputs应该是embedding而不是id
# print("Decoder cell step inputs:", paddle.shape(inputs))
context_vector = self.attention_layer(inputs, encoder_outputs)
lstm_inputs = paddle.concat((inputs, context_vector), axis=-1)
outputs, new_states = self.lstm_cell(lstm_inputs, initial_states)
return outputs, new_states
class Decoder(nn.Layer):
def __init__(self, hidden_size, embedding_size, vocab_size):
super(Decoder, self).__init__()
self.emb = nn.Embedding(vocab_size, embedding_size)
self.lstm = nn.RNN(DecoderCell(hidden_size, embedding_size), time_major=False)
def forward(self, inputs, initital_states, encoder_outputs):
emb_inputs = self.emb(inputs)
# print(paddle.shape(emb_inputs))
# print("Decoder forward inputs shape:", inputs.shape)
outputs, final_states = self.lstm(emb_inputs, initital_states, encoder_outputs=encoder_outputs)
return outputs, final_states
class Seq2seq(nn.Layer):
def __init__(self, hidden_size, embedding_size, vocab_size):
super(Seq2seq, self).__init__()
self.encoder = Encoder()
self.decoder = Decoder(hidden_size, embedding_size, vocab_size)
self.output_layer = nn.Linear(hidden_size, vocab_size)
def forward(self, src, trg):
encoder_output, (hidden, cell) = self.encoder(src)
# print("encoder output shape:", encoder_output.shape)
hidden = paddle.squeeze(hidden)
cell = paddle.squeeze(cell)
outputs, _ = self.decoder(trg, (hidden, cell), encoder_output)
logits = self.output_layer(outputs)
return logits
# 前面是模型的定义部分,下面是模型的训练部分
epochs = 1
paddle.set_device('cpu')
seq2seq = Seq2seq(hidden_size, embedding_size, vocab_size)
opt = paddle.optimizer.Adam(learning_rate=0.001, parameters=seq2seq.parameters())
for epoch in range(epochs):
print("epoch:{}".format(epoch))
# shuffle training data
perm = np.random.permutation(len(pairs.train_qry_sents))
for iteration in range(pairs.train_qry_sents.shape[0] // batch_size):
x_data = pairs.train_qry_sents[(batch_size*iteration):(batch_size*(iteration+1))]
sent = paddle.to_tensor(x_data)
x_res_data = pairs.train_res_sents[(batch_size*iteration):(batch_size*(iteration+1))]
x_res_label_data = pairs.train_res_label_sents[(batch_size*iteration):(batch_size*(iteration+1))]
res_data = paddle.to_tensor(x_res_data)
res_label_data = paddle.to_tensor(x_res_label_data)
loss = paddle.zeros([1])
logits = seq2seq(sent, res_data)
# print(paddle.shape(logits), paddle.shape(res_label_data))
cost = F.cross_entropy(input=logits, label=res_label_data, reduction='none', soft_label=False)
batch_mean_cost = paddle.mean(cost, axis=[0])
seq_cost = paddle.sum(batch_mean_cost)
loss = seq_cost / (MAX_LEN + 2)
if(iteration % 20 == 0):
print("iter {}, loss:{}".format(iteration, loss.numpy()))
loss.backward()
opt.step()
opt.clear_grad()
0
收藏
请登录后评论
试试GPU
如果还报错建议贴出来更多的一些报错内容,报错资料有点少