报错信息:
我的训练部分代码:
# %% 开始训练
def start_train(model, offset=215):
    """Train ``model``, evaluate it after every epoch, and save checkpoints.

    The function relies on names defined elsewhere in the file:
    ``setting_opt`` (builds the optimizer), ``train_loader`` /
    ``test_loader`` (callables yielding ``(image, label)`` batches),
    ``epochs_num``, and the ``paddle`` / ``F`` / ``np`` imports.

    Args:
        model: The network to train. It is called on image batches and its
            ``state_dict`` is restored from and saved to disk.
        offset: Epoch number of the checkpoint to resume from; it is also
            added to the epoch counter used for logging and checkpoint
            names. Pass 0 to start without loading a checkpoint. Defaults
            to 215, the value that used to be hard-coded here.

    Returns:
        None. Progress is printed once per epoch; the best model/optimizer
        state and a periodic checkpoint are written with ``paddle.save``.
    """
    best_test_acc = 0.0
    best_test_train = 0.0
    model_save_dir = "./work/catdog.inference"
    print("start training....")
    model.train()

    # Call the setup function so optimizer/scheduler changes live in one place.
    opt = setting_opt(model)

    # Resume training: restore model and optimizer state from the checkpoint
    # written at epoch ``offset``. Skipped when offset is 0 (fresh run).
    if offset:
        params_path = './checkpoint/mnist_epoch' + str(offset)
        params_dict = paddle.load(params_path + '.pdparams')
        opt_dict = paddle.load(params_path + '.pdopt')
        model.set_state_dict(params_dict)
        opt.set_state_dict(opt_dict)

    # One metric object is reused; it is reset after every batch so each
    # reading is the accuracy of that single batch.
    metric = paddle.metric.Accuracy()

    for epoch in range(epochs_num):
        # Training phase
        train_acc = []
        train_loss = []
        for image, label in train_loader():
            label = paddle.fluid.layers.one_hot(label, 10)
            predict = model(image)
            loss = F.cross_entropy(predict, label, soft_label=True)

            metric.update(metric.compute(predict, label))
            acc = metric.accumulate()
            metric.reset()

            loss.backward()
            opt.step()
            opt.clear_grad()

            train_loss.append(loss.numpy()[0])
            train_acc.append(acc)

        # Evaluation phase
        accs = []
        test_loss = []
        model.eval()
        # No gradients are needed while evaluating, so skip graph building.
        with paddle.no_grad():
            for image, label in test_loader():
                label = paddle.fluid.layers.one_hot(label, 10)
                predict = model(image)

                metric.update(metric.compute(predict, label))
                accs.append(metric.accumulate())
                metric.reset()

                loss = F.cross_entropy(predict, label, soft_label=True)
                avg_loss = paddle.mean(loss)
                test_loss.append(avg_loss.numpy()[0])
        avg_acc = np.mean(accs)

        # Save the model whenever the test accuracy matches or beats the best.
        now_train = np.mean(train_acc)
        if avg_acc >= best_test_acc:
            best_test_acc = avg_acc
            best_test_train = now_train
            paddle.save(model.state_dict(), model_save_dir)
            paddle.save(opt.state_dict(), model_save_dir + 'opt')

        # NOTE(review): the train loss is summed over batches while the test
        # loss is averaged -- confirm that this asymmetry is intended.
        print('Test:%d, Accuracy:%0.5f, Best: %0.5f, Train:%0.5f, BestTrain: %0.5f, Train_Loss: %0.5f, Test_Loss: %0.5f' % (
            epoch + offset, avg_acc, best_test_acc, now_train, best_test_train, np.sum(train_loss), np.mean(test_loss)
        ))

        # Periodic checkpoint every 5 (offset-adjusted) epochs.
        if (epoch + offset) % 5 == 0:
            checkpoint_path = './checkpoint/mnist_epoch{}'.format(epoch + offset)
            paddle.save(model.state_dict(), checkpoint_path + '.pdparams')
            paddle.save(opt.state_dict(), checkpoint_path + '.pdopt')

        # Back to training mode for the next epoch.
        model.train()
# %% Run
# Select the compute device first, then build the model and train it.
use_gpu = True
paddle.set_device('gpu:0' if use_gpu else 'cpu')
model = ResNet0()
start_train(model)
print(1)
opt = setting_opt(model)
setting_opt 这个函数的实现是什么?
模型结构的bug也可能导致opt存储失败
我曾经保存的 opt 文件只有 44 KB
优化器的保存、读取和模型权重是一样的