Error during Mask-RCNN training
Paddle框架 · Q&A · Deep learning model training

Traceback (most recent call last):
File "train.py", line 258, in
train()
File "train.py", line 112, in train
exe.run(fluid.default_startup_program())
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/executor.py", line 651, in run
use_program_cache=use_program_cache)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/executor.py", line 749, in _run
exe.run(program.desc, scope, 0, True, True, fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet: Invoke operator fill_constant error.
Python Callstacks:
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/framework.py", line 1842, in _prepend_op
attrs=kwargs.get("attrs", None))
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/initializer.py", line 189, in __call__
stop_gradient=True)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/framework.py", line 1625, in create_var
kwargs['initializer'](var, self)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/layer_helper_base.py", line 383, in set_variable_initializer
initializer=initializer)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/optimizer.py", line 317, in _add_accumulator
var, initializer=Constant(value=float(fill_value)))
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/optimizer.py", line 760, in _create_accumulators
self._add_accumulator(self._velocity_acc_str, p)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/optimizer.py", line 364, in _create_optimization_pass
[p[0] for p in parameters_and_grads])
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/optimizer.py", line 532, in apply_gradients
optimize_ops = self._create_optimization_pass(params_grads)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/optimizer.py", line 562, in apply_optimize
optimize_ops = self.apply_gradients(params_grads)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/optimizer.py", line 601, in minimize
loss, startup_program=startup_program, params_grads=params_grads)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py", line 87, in __impl__
return func(*args, **kwargs)
File "/home/gpu-server2/anaconda3/envs/CNdetection/lib/python3.7/site-packages/paddle/fluid/wrapped_decorator.py", line 25, in __impl__
return wrapped_func(*args, **kwargs)
File "", line 2, in minimize
File "train.py", line 102, in train
optimizer.minimize(loss)
File "train.py", line 258, in
train()
C++ Callstacks:
Enforce failed. Expected allocating <= available, but received allocating:10725104759 > available:10702159616.
Insufficient GPU memory to allocation. at [/paddle/paddle/fluid/platform/gpu_info.cc:262]
PaddlePaddle Call Stacks:
0 0x7f6b521d2e78p void paddle::platform::EnforceNotMet::Init(std::string, char const*, int) + 360
1 0x7f6b521d31c7p paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int) + 87
2 0x7f6b54324c66p paddle::platform::GpuMaxChunkSize() + 630
3 0x7f6b542f8f5ap
4 0x7f6b84277827p
5 0x7f6b542f85fdp paddle::memory::legacy::GetGPUBuddyAllocator(int) + 109
6 0x7f6b542f9445p void* paddle::memory::legacy::Alloc(paddle::platform::CUDAPlace const&, unsigned long) + 37
7 0x7f6b542f9985p paddle::memory::allocation::LegacyAllocator::AllocateImpl(unsigned long) + 421
8 0x7f6b542edaa5p paddle::memory::allocation::AllocatorFacade::Alloc(boost::variant const&, unsigned long) + 181
9 0x7f6b542edc2ap paddle::memory::allocation::AllocatorFacade::AllocShared(boost::variant const&, unsigned long) + 26
10 0x7f6b53ee676cp paddle::memory::AllocShared(boost::variant const&, unsigned long) + 44
11 0x7f6b542bf7f4p paddle::framework::Tensor::mutable_data(boost::variant, paddle::framework::proto::VarType_Type, unsigned long) + 148
12 0x7f6b52fe7a2ep paddle::operators::FillConstantKernel::Compute(paddle::framework::ExecutionContext const&) const + 494
13 0x7f6b52feab43p std::_Function_handler, paddle::operators::FillConstantKernel, paddle::operators::FillConstantKernel, paddle::operators::FillConstantKernel, paddle::operators::FillConstantKernel >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&) + 35
14 0x7f6b54261057p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant const&, paddle::framework::RuntimeContext*) const + 375
15 0x7f6b54261431p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant const&) const + 529
16 0x7f6b5425ea2cp paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant const&) + 332
17 0x7f6b5235d81ep paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 606
18 0x7f6b523607dfp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector > const&, bool) + 143
19 0x7f6b521c3fddp
20 0x7f6b52205286p
21 0x55e9977a8744p _PyMethodDef_RawFastCallKeywords + 596
22 0x55e9977a8861p _PyCFunction_FastCallKeywords + 33
23 0x55e9978146e8p _PyEval_EvalFrameDefault + 21240
24 0x55e997758539p _PyEval_EvalCodeWithName + 761
25 0x55e9977a7f57p _PyFunction_FastCallKeywords + 903
26 0x55e9978108ccp _PyEval_EvalFrameDefault + 5340
27 0x55e997758539p _PyEval_EvalCodeWithName + 761
28 0x55e9977a7ef5p _PyFunction_FastCallKeywords + 805
29 0x55e99780fa93p _PyEval_EvalFrameDefault + 1699
30 0x55e997758d09p _PyEval_EvalCodeWithName + 2761
31 0x55e9977a7f57p _PyFunction_FastCallKeywords + 903
32 0x55e99780f806p _PyEval_EvalFrameDefault + 1046
33 0x55e997758539p _PyEval_EvalCodeWithName + 761
34 0x55e997759424p PyEval_EvalCodeEx + 68
35 0x55e99775944cp PyEval_EvalCode + 28
36 0x55e99786eb74p
37 0x55e997878eb1p PyRun_FileExFlags + 161
38 0x55e9978790a3p PyRun_SimpleFileExFlags + 451
39 0x55e99787a195p
40 0x55e99787a2bcp _Py_UnixMain + 60
41 0x7f6b83e98b97p __libc_start_main + 231
42 0x55e99781f062p

 

According to the solution given in a GitHub issue I found online, clearing the GPU cache should fix this, but even after clearing the cache, and even after rebooting the machine, training still fails with the same error. Any help would be appreciated.
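
(For reference: the C++ call stack fails inside GpuMaxChunkSize, i.e. Paddle could not reserve its initial memory chunk. The numbers show it trying to pre-allocate about 10.7 GB in one piece, slightly more than the 10.7 GB currently free, so even a few hundred MB held by the desktop or another process is enough to trigger this. A rough Python sketch of one workaround follows, assuming this Paddle 1.x build reads the FLAGS_fraction_of_gpu_memory_to_use and FLAGS_eager_delete_tensor_gb environment flags; they must be set before paddle.fluid is first imported.)

# Rough sketch (assumption: this Paddle 1.x build honours the two FLAGS_ env
# vars below; they have to be set before paddle.fluid is imported/initialized).
import os

# Pre-allocate only ~50% of free GPU memory instead of the default fraction,
# and garbage-collect temporary tensors as soon as they are unused.
os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = "0.5"
os.environ["FLAGS_eager_delete_tensor_gb"] = "0.0"

import paddle.fluid as fluid  # import only after the flags are set

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())  # should no longer OOM at startup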

All comments (3)
thinc
#2 · Replied 2020-04

You could try shrinking the input data size further, or setting a lower batch_size.
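
A minimal sketch of the batch_size knob, assuming the script builds its reader with the Paddle 1.x paddle.batch API (the Mask-RCNN sample may instead expose this as a command-line argument, which amounts to the same thing):

import paddle

def make_train_reader(sample_reader, batch_size=1):
    # Fewer images per iteration -> proportionally less activation memory.
    return paddle.batch(sample_reader, batch_size=batch_size, drop_last=True)

# Usage (hypothetical): train_reader = make_train_reader(my_sample_reader, batch_size=1)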

AIStudio810260
#3 · Replied 2020-04

Resize the input images to a smaller size.
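
A rough sketch of that idea, assuming images are decoded with OpenCV before being fed to the network (cv2 and the size limits below are illustrative, not taken from the original script). Activation memory scales roughly with the number of pixels, so halving each side cuts it to about a quarter:

import cv2

def resize_for_training(img, short_side=600, max_side=1000):
    # Scale so the short side becomes `short_side`, capped so the long side
    # never exceeds `max_side`.
    h, w = img.shape[:2]
    scale = min(short_side / min(h, w), max_side / max(h, w))
    return cv2.resize(img, None, fx=scale, fy=scale,
                      interpolation=cv2.INTER_LINEAR)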

星光ld1
#4 · Replied 2020-05

@秦人南山

The error means GPU memory is exhausted. Without changing hardware, the fixes boil down to: lowering the batch size; lowering the input resolution, i.e. downsampling the images; switching to a smaller model architecture; or training with mixed precision. A sketch of the mixed-precision route follows below.
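
A minimal sketch of the mixed-precision option, assuming this Paddle 1.x build ships fluid.contrib.mixed_precision (the Momentum optimizer below matches the velocity accumulator visible in the traceback; the toy loss is only there to make the snippet self-contained):

import paddle.fluid as fluid
from paddle.fluid.contrib import mixed_precision

# Toy network standing in for the Mask-RCNN loss.
x = fluid.layers.data(name="x", shape=[8], dtype="float32")
loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=1))

optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
# Wrap the existing optimizer: most of forward/backward then runs in FP16,
# roughly halving activation memory, with loss scaling so small gradients
# do not underflow.
optimizer = mixed_precision.decorate(optimizer, init_loss_scaling=512.0)
optimizer.minimize(loss)  # same call site as train.py, line 102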
