运行Seq2seq模型时，在参数优化过程报错：RuntimeError: (NotFound)

项目

数据集

课程

比赛

模型库

活动

论坛

访问飞桨官网

项目

数据集

课程

比赛

模型库

活动

论坛

访问飞桨官网

SPiCa 发布于2021-07

代码在执行时前向没有问题，在执行Loss.backward()时报错：RuntimeError: (NotFound) Inputs and outputs of squeeze2_grad do not exist. 代码如下，不太熟练，写得不太标准，希望高人能够指点一下，谢谢！

class Encoder(paddle.nn.Layer):
    def __init__(self):
        super(Encoder, self).__init__()
        self.emb = paddle.nn.Embedding(vocab_size, embedding_size,)

        # for single layer lstm encoder model
        self.lstm = paddle.nn.LSTM(input_size=embedding_size, 
                                            hidden_size=hidden_size, 
                                            num_layers=num_encoder_lstm_layers)


    def forward(self, x):
        x = self.emb(x)
        x, states = self.lstm(x)
        return x, states

class AttentionLayer(nn.Layer):
    def __init__(self, hidden_size):
        super(AttentionLayer, self).__init__()
        self.linear1 = nn.Linear(hidden_size*2, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)

    def forward(self, hidden, encoder_outputs):
        hidden = paddle.unsqueeze(encoder_hidden, axis=1) # hidden.shape=[batch, hidden]
        attention_inputs = paddle.concat((encoder_outputs, paddle.tile(hidden, repeat_times=[1, MAX_LEN+1, 1])),
            axis=-1
            )

        tmp_tensor = linear1(attention_inputs)
        tmp_tensor = F.tanh(tmp_tensor)
        weights = linear2(tmp_tensor)
        weights = paddle.squeeze(weights)
        attention_weights = F.softmax(weights)        
        attention_weights = paddle.expand_as(paddle.unsqueeze(attention_weights, -1), 
                                                encoder_outputs)

        context_vector = paddle.multiply(encoder_outputs, attention_weights)               
        context_vector = paddle.sum(context_vector, 1)

        return context_vector

class DecoderCell(nn.RNNCellBase):
    def __init__(self, hidden_size, embedding_size):
        super(DecoderCell, self).__init__()
        self.lstm_cell = nn.LSTMCell(embedding_size + hidden_size, hidden_size)
        self.attention_layer = AttentionLayer(hidden_size)

    def forward(self, inputs, initial_states, encoder_outputs):
        # 输入的inputs应该是embedding而不是id
        # print("Decoder cell step inputs:", paddle.shape(inputs))
        context_vector = self.attention_layer(inputs, encoder_outputs)
        lstm_inputs = paddle.concat((inputs, context_vector), axis=-1)
        outputs, new_states = self.lstm_cell(lstm_inputs, initial_states)
        
        
        return outputs, new_states

class Decoder(nn.Layer):
    def __init__(self, hidden_size, embedding_size, vocab_size):
        super(Decoder, self).__init__()
        self.emb = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.RNN(DecoderCell(hidden_size, embedding_size), time_major=False)

    def forward(self, inputs, initital_states, encoder_outputs):
        emb_inputs = self.emb(inputs)
        # print(paddle.shape(emb_inputs))
        # print("Decoder forward inputs shape:", inputs.shape)
        outputs, final_states = self.lstm(emb_inputs, initital_states, encoder_outputs=encoder_outputs)

        return outputs, final_states

class Seq2seq(nn.Layer):
    def __init__(self, hidden_size, embedding_size, vocab_size):
        super(Seq2seq, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(hidden_size, embedding_size, vocab_size)
        self.output_layer = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, trg):
        encoder_output, (hidden, cell) = self.encoder(src)
        # print("encoder output shape:", encoder_output.shape)
        hidden = paddle.squeeze(hidden)
        cell = paddle.squeeze(cell)
        outputs, _ = self.decoder(trg, (hidden, cell), encoder_output)
        logits = self.output_layer(outputs)

        return logits

# 前面是模型的定义部分，下面是模型的训练部分

epochs = 1
paddle.set_device('cpu')

seq2seq = Seq2seq(hidden_size, embedding_size, vocab_size)
opt = paddle.optimizer.Adam(learning_rate=0.001, parameters=seq2seq.parameters())

for epoch in range(epochs):
    print("epoch:{}".format(epoch))

    # shuffle training data
    perm = np.random.permutation(len(pairs.train_qry_sents))

    for iteration in range(pairs.train_qry_sents.shape[0] // batch_size):
        x_data = pairs.train_qry_sents[(batch_size*iteration):(batch_size*(iteration+1))]
        sent = paddle.to_tensor(x_data)

        x_res_data = pairs.train_res_sents[(batch_size*iteration):(batch_size*(iteration+1))]
        x_res_label_data = pairs.train_res_label_sents[(batch_size*iteration):(batch_size*(iteration+1))]
        res_data = paddle.to_tensor(x_res_data)
        res_label_data = paddle.to_tensor(x_res_label_data)

        loss = paddle.zeros([1])
        logits = seq2seq(sent, res_data)
        # print(paddle.shape(logits), paddle.shape(res_label_data))
        cost = F.cross_entropy(input=logits, label=res_label_data, reduction='none', soft_label=False)
        batch_mean_cost = paddle.mean(cost, axis=[0])
        seq_cost = paddle.sum(batch_mean_cost)
        loss = seq_cost / (MAX_LEN + 2)

        if(iteration % 20 == 0):
            print("iter {}, loss:{}".format(iteration, loss.numpy()))

        loss.backward()
        opt.step()
        opt.clear_grad()

全部评论(2)

JavaRoom

#2 回复于2021-07

试试GPU

三岁

#3 回复于2021-07

如果还报错建议贴出来更多的一些报错内容，报错资料有点少

提issue

需求/bug反馈？一键提issue告诉我们

提pr

发现bug？如果您知道修复办法，欢迎提pr直接参与建设飞桨~