同一段程序重复运行报错

项目

数据集

课程

比赛

模型库

活动

论坛

访问飞桨官网

项目

数据集

课程

比赛

模型库

活动

论坛

访问飞桨官网

静静心幽谷发布于2021-01

论文引用网络节点分类比赛的Baseline

from easydict import EasyDict as edict
import pgl
import model
import paddle.fluid as fluid
import numpy as np
import time
from build_model import build_model
def nn(dataset,train_index,train_label,val_index,val_label,dp,lr,wd,ed,num):
    from easydict import EasyDict as edict
    import pgl
    import model
    import paddle.fluid as fluid
    import numpy as np
    import time
    from build_model import build_model
    dataset=dataset
    train_index=train_index
    train_label=train_label
    val_index=val_index
    val_label=val_label
    dp=dp
    lr=lr
    wd=wd
    ed=ed

    config = {
        "model_name": "GCN",
        "num_layers": 2,
        "dropout": dp,
        "learning_rate": lr,
        "weight_decay": wd,
        "edge_dropout": ed,
    }

    config = edict(config)
    # 使用CPU
    # place = fluid.CPUPlace()

# 使用GPU
place = fluid.CUDAPlace(0)

    train_program = fluid.default_main_program()
    startup_program = fluid.default_startup_program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            gw, loss, acc, pred = build_model(dataset,
                                config=config,
                                phase="train",
                                main_prog=train_program)

    test_program = fluid.Program()
    with fluid.program_guard(test_program, startup_program):
        with fluid.unique_name.guard():
            _gw, v_loss, v_acc, v_pred = build_model(dataset,
                config=config,
                phase="test",
                main_prog=test_program)

test_program = test_program.clone(for_test=True)

    exe = fluid.Executor(place)
    epoch = num
    exe.run(startup_program)

# 将图数据变成 feed_dict 用于传入Paddle Excecutor
feed_dict = gw.to_feed(dataset.graph)

    for epoch in range(epoch):
        # Full Batch 训练
        # 设定图上面那些节点要获取
        # node_index: 训练节点的nid
        # node_label: 训练节点对应的标签
        feed_dict["node_index"] = np.array(train_index, dtype="int64")
        feed_dict["node_label"] = np.array(train_label, dtype="int64")

        train_loss, train_acc = exe.run(train_program,
                                    feed=feed_dict,
                                    fetch_list=[loss.name, acc.name],
                                    return_numpy=True)

        # Full Batch 验证
        # 设定图上面那些节点要获取
        # node_index: 训练节点的nid
        # node_label: 训练节点对应的标签
        feed_dict["node_index"] = np.array(val_index, dtype="int64")
        feed_dict["node_label"] = np.array(val_label, dtype="int64")
        val_loss, val_acc = exe.run(test_program,
                                feed=feed_dict,
                                fetch_list=[v_loss.name, v_acc.name],
                                return_numpy=True)
        print("Epoch", epoch, "Train Acc", train_acc[0], "Valid Acc", val_acc[0])
    result=(-1)*val_acc[0]
    return result
a=nn(dataset,train_index,train_label,val_index,val_label,dp=0.4,lr=0.001,wd=0.0005,ed=0,num=10)

第一次运行正常

第二次运行报错

！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！

---------------------------------------------------------------------------EnforceNotMet Traceback (most recent call last) in
95 result=(-1)*val_acc[0]
96 return result
---> 97 a=nn(dataset,train_index,train_label,val_index,val_label,dp=0.4,lr=0.001,wd=0.0005,ed=0,num=10)
in nn(dataset, train_index, train_label, val_index, val_label, dp, lr, wd, ed, num)
80 feed=feed_dict,
81 fetch_list=[loss.name, acc.name],
---> 82 return_numpy=True)
83
84 # Full Batch 验证
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1069 warnings.warn(
1070 "The following exception is not an EOF exception.")
-> 1071 six.reraise(*sys.exc_info())
1072
1073 def _run_impl(self, program, feed, fetch_list, feed_var_name,
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
701 if value.__traceback__ is not tb:
702 raise value.with_traceback(tb)
--> 703 raise value
704 finally:
705 value = None
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1064 use_program_cache=use_program_cache,
1065 use_prune=use_prune,
-> 1066 return_merged=return_merged)
1067 except Exception as e:
1068 if not isinstance(e, core.EOFException):
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_impl(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1152 scope=scope,
1153 return_numpy=return_numpy,
-> 1154 use_program_cache=use_program_cache)
1155
1156 program._compile(scope, self.place)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_program(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache)
1227 if not use_program_cache:
1228 self._default_executor.run(program.desc, scope, 0, True, True,
-> 1229 fetch_var_name)
1230 else:
1231 self._default_executor.run_prepared_ctx(ctx, scope, False, False,
EnforceNotMet:

--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString(std::string&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(paddle::platform::ErrorSummary const&, char const*, int)
2 paddle::framework::OperatorWithKernel::ChooseKernel(paddle::framework::RuntimeContext const&, paddle::framework::Scope const&, paddle::platform::Place const&) const
3 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
4 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
5 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
6 paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
7 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
8 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector > const&, bool, bool)

----------------------
Error Message Summary:
----------------------
Error: op sum does not have kernel for data_type[bool]:data_layout[ANY_LAYOUT]:place[CUDAPlace(0)]:library_type[PLAIN] at (/paddle/paddle/fluid/framework/operator.cc:1081)
[operator < sum > error]

全部评论(1)

AIStudio810258

#2 回复于2021-01

可能因为资源耗尽，也可能因为部分代码逻辑上不能重复执行吧

提issue

需求/bug反馈？一键提issue告诉我们

提pr

发现bug？如果您知道修复办法，欢迎提pr直接参与建设飞桨~