论文引用网络节点分类比赛的Baseline
from easydict import EasyDict as edict
import pgl
import model
import paddle.fluid as fluid
import numpy as np
import time
from build_model import build_model
def nn(dataset,train_index,train_label,val_index,val_label,dp,lr,wd,ed,num):
from easydict import EasyDict as edict
import pgl
import model
import paddle.fluid as fluid
import numpy as np
import time
from build_model import build_model
dataset=dataset
train_index=train_index
train_label=train_label
val_index=val_index
val_label=val_label
dp=dp
lr=lr
wd=wd
ed=ed
config = {
"model_name": "GCN",
"num_layers": 2,
"dropout": dp,
"learning_rate": lr,
"weight_decay": wd,
"edge_dropout": ed,
}
config = edict(config)
# 使用CPU
# place = fluid.CPUPlace()
# 使用GPU
place = fluid.CUDAPlace(0)
train_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()
with fluid.program_guard(train_program, startup_program):
with fluid.unique_name.guard():
gw, loss, acc, pred = build_model(dataset,
config=config,
phase="train",
main_prog=train_program)
test_program = fluid.Program()
with fluid.program_guard(test_program, startup_program):
with fluid.unique_name.guard():
_gw, v_loss, v_acc, v_pred = build_model(dataset,
config=config,
phase="test",
main_prog=test_program)
test_program = test_program.clone(for_test=True)
exe = fluid.Executor(place)
epoch = num
exe.run(startup_program)
# 将图数据变成 feed_dict 用于传入Paddle Excecutor
feed_dict = gw.to_feed(dataset.graph)
for epoch in range(epoch):
# Full Batch 训练
# 设定图上面那些节点要获取
# node_index: 训练节点的nid
# node_label: 训练节点对应的标签
feed_dict["node_index"] = np.array(train_index, dtype="int64")
feed_dict["node_label"] = np.array(train_label, dtype="int64")
train_loss, train_acc = exe.run(train_program,
feed=feed_dict,
fetch_list=[loss.name, acc.name],
return_numpy=True)
# Full Batch 验证
# 设定图上面那些节点要获取
# node_index: 训练节点的nid
# node_label: 训练节点对应的标签
feed_dict["node_index"] = np.array(val_index, dtype="int64")
feed_dict["node_label"] = np.array(val_label, dtype="int64")
val_loss, val_acc = exe.run(test_program,
feed=feed_dict,
fetch_list=[v_loss.name, v_acc.name],
return_numpy=True)
print("Epoch", epoch, "Train Acc", train_acc[0], "Valid Acc", val_acc[0])
result=(-1)*val_acc[0]
return result
a=nn(dataset,train_index,train_label,val_index,val_label,dp=0.4,lr=0.001,wd=0.0005,ed=0,num=10)
第一次运行正常
第二次运行报错
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
---------------------------------------------------------------------------EnforceNotMet Traceback (most recent call last) in
95 result=(-1)*val_acc[0]
96 return result
---> 97 a=nn(dataset,train_index,train_label,val_index,val_label,dp=0.4,lr=0.001,wd=0.0005,ed=0,num=10)
in nn(dataset, train_index, train_label, val_index, val_label, dp, lr, wd, ed, num)
80 feed=feed_dict,
81 fetch_list=[loss.name, acc.name],
---> 82 return_numpy=True)
83
84 # Full Batch 验证
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1069 warnings.warn(
1070 "The following exception is not an EOF exception.")
-> 1071 six.reraise(*sys.exc_info())
1072
1073 def _run_impl(self, program, feed, fetch_list, feed_var_name,
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
701 if value.__traceback__ is not tb:
702 raise value.with_traceback(tb)
--> 703 raise value
704 finally:
705 value = None
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1064 use_program_cache=use_program_cache,
1065 use_prune=use_prune,
-> 1066 return_merged=return_merged)
1067 except Exception as e:
1068 if not isinstance(e, core.EOFException):
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_impl(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1152 scope=scope,
1153 return_numpy=return_numpy,
-> 1154 use_program_cache=use_program_cache)
1155
1156 program._compile(scope, self.place)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_program(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache)
1227 if not use_program_cache:
1228 self._default_executor.run(program.desc, scope, 0, True, True,
-> 1229 fetch_var_name)
1230 else:
1231 self._default_executor.run_prepared_ctx(ctx, scope, False, False,
EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString(std::string&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(paddle::platform::ErrorSummary const&, char const*, int)
2 paddle::framework::OperatorWithKernel::ChooseKernel(paddle::framework::RuntimeContext const&, paddle::framework::Scope const&, paddle::platform::Place const&) const
3 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
4 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
5 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
6 paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
7 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
8 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector > const&, bool, bool)
----------------------
Error Message Summary:
----------------------
Error: op sum does not have kernel for data_type[bool]:data_layout[ANY_LAYOUT]:place[CUDAPlace(0)]:library_type[PLAIN] at (/paddle/paddle/fluid/framework/operator.cc:1081)
[operator < sum > error]
可能因为资源耗尽,也可能因为部分代码逻辑上不能重复执行吧