hi 我将feed数据 拷贝到了一个list中,发现还是有问题
place = fluid.CUDAPlace(0)
device_num = fluid.core.get_cuda_device_count()
with fluid.program_guard(train_program, train_startup_program):
    with fluid.unique_name.guard():
        instance = DoubleTower()
        instance.forward()
        # PyReader's feed_list must be a FLAT list of Variables, one per
        # network input -- NOT replicated per device.  Passing
        # [feed_data] * device_num nests lists, and PyReader then fails with
        # "AttributeError: 'list' object has no attribute 'name'" when it
        # reads v.name from each element.  Per-device splitting is driven by
        # the `places` argument of decorate_batch_generator instead.
        feed_data = [
            instance.input_src_ids,
            instance.input_txt_ids,
            instance.input_pos_ids,
            instance.input_mask,
            instance.input_image,
            instance.input_soft_label,
        ]
        train_loss = instance.loss
        fluid.optimizer.Adam(learning_rate=0.001).minimize(train_loss)
        train_reader = fluid.io.PyReader(
            feed_list=feed_data,
            capacity=10,
            iterable=True)
        # Feed one data copy per visible GPU so the count matches the
        # (multi-card) executor; a single CUDAPlace only feeds one card.
        train_reader.decorate_batch_generator(
            train_batch_gen, places=fluid.cuda_places())
又出现了错误
File "distill_ernie_asyn.py", line 153, in train
train_reader.decorate_batch_generator(train_batch_gen, places=place)
File "/home/work/lixiaokang04/tools/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/reader.py", line 646, in decorate_batch_generator
self._init_iterable(places)
File "/home/work/lixiaokang04/tools/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/reader.py", line 217, in _init_iterable
self._var_names = [v.name for v in self._feed_list]
AttributeError: 'list' object has no attribute 'name'
按照你说的, 我的理解是 如果是四卡的情况,则将feed_list转变为 以下的形式是吗?
[
[src_ids, txt_ids, pos_ids, mask, image, label],
[src_ids, txt_ids, pos_ids, mask, image, label],
[src_ids, txt_ids, pos_ids, mask, image, label],
[src_ids, txt_ids, pos_ids, mask, image, label]
]
这个list中的每一个元素list内容都相同?
# Suggested fix: use one place per visible GPU so the reader feeds every
# device the executor runs on (a single CUDAPlace only feeds one card).
places = fluid.cuda_places()
train_reader.decorate_batch_generator(dev_batch_gen, places=places)
这样写试下呢?
hi baiyfbupt 使用places = fluid.cuda_places() 之后出现了如下错误
Traceback (most recent call last):
File "distill_ernie_asyn.py", line 381, in <module>
train()
File "distill_ernie_asyn.py", line 235, in train
exe = fluid.Executor(place)
File "/home/work/lixiaokang04/tools/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/executor.py", line 368, in __init__
p.set_place(self.place)
TypeError: set_place(): incompatible function arguments. The following argument types are supported:
1. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.Place) -> None
2. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.CPUPlace) -> None
3. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.CUDAPlace) -> None
4. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.CUDAPinnedPlace) -> None
Invoked with: <paddle.fluid.core_avx.Place object at 0x7f4f7369fca8>, [<paddle.fluid.core_avx.CUDAPlace object at 0x7f4f78639340>, <paddle.fluid.core_avx.CUDAPlace object at 0x7f4f78639308>]
ValueError: Feed a list of tensor, the list should be the same size as places
这个问题出现的原因是executor运行的place和reader feed的place不符,比如你executor是在多卡上运行的,但是reader只按单卡进行了feed,所以会出错
export CUDA_VISIBLE_DEVICES=4,5
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
export FLAGS_fraction_of_gpu_memory_to_use=0.1
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fast_eager_deletion_mode=1
# NOTE(review): this is the ORIGINAL (buggy) setup quoted in the issue --
# decorate_batch_generator receives a single CUDAPlace while the executor
# presumably runs on multiple cards, so the number of fed data copies does
# not match the number of execution places (see the
# "Feed a list of tensor, the list should be the same size as places" error).
place = fluid.CUDAPlace(0)
train_reader = fluid.io.PyReader(
feed_list=[
instance.input_src_ids,
instance.input_txt_ids,
instance.input_pos_ids,
instance.input_mask,
instance.input_image,
instance.input_soft_label],
capacity=10,
iterable=True)
train_reader.decorate_batch_generator(dev_batch_gen, places=place)
最开始这段代码出问题,大概率是因为使用了多卡的 executor,但 decorate_batch_generator 的 places 被设成了单卡的 place,导致喂入的数据份数与设备数不符。
建议在原始版本的代码上,把 train_reader.decorate_batch_generator(dev_batch_gen, places=place) 中的 place 改成 fluid.cuda_places(),
再试一下。
我的exe是这么设置的
# fluid.cuda_places() returns a LIST of CUDAPlace objects.  Binding that
# list to the name `place` and later passing it to fluid.Executor(place)
# raises "TypeError: set_place(): incompatible function arguments" --
# Executor accepts exactly one place.  Keep the list under a separate name
# for the reader, and give the executor a single place.
places = fluid.cuda_places() if config_use_cuda else fluid.CPUPlace()
place = fluid.CUDAPlace(0) if config_use_cuda else fluid.CPUPlace()

exec_strategy = fluid.ExecutionStrategy()
# One executor thread per visible GPU.
exec_strategy.num_threads = fluid.core.get_cuda_device_count()
exec_strategy.num_iteration_per_drop_scope = 100

build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.remove_unnecessary_lock = False

train_exe = fluid.ParallelExecutor(
    use_cuda=True,
    main_program=train_program,
    loss_name=train_loss.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)
错误还是一样的
Traceback (most recent call last):
File "distill_ernie_asyn.py", line 356, in <module>
train()
File "distill_ernie_asyn.py", line 210, in train
exe = fluid.Executor(place)
File "/home/work/lixiaokang04/tools/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/executor.py", line 368, in __init__
p.set_place(self.place)
TypeError: set_place(): incompatible function arguments. The following argument types are supported:
1. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.Place) -> None
2. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.CPUPlace) -> None
3. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.CUDAPlace) -> None
4. (self: paddle.fluid.core_avx.Place, arg0: paddle.fluid.core_avx.CUDAPinnedPlace) -> None
Invoked with: <paddle.fluid.core_avx.Place object at 0x7f74df52b998>, [<paddle.fluid.core_avx.CUDAPlace object at 0x7f74e73103e8>]
经过线下沟通,将上述代码改成以下可正常运行
# Final working configuration: a LIST of places for the reader (one per
# visible GPU), a SINGLE place for the Executor that runs startup.
places = fluid.cuda_places() if config_use_cuda else fluid.CPUPlace()
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(train_startup_program) # initialization can only run on a single card
train_reader = fluid.io.PyReader(
feed_list=[
instance.input_src_ids,
instance.input_txt_ids,
instance.input_pos_ids,
instance.input_mask,
instance.input_image,
instance.input_soft_label],
capacity=10,
iterable=True)
train_reader.decorate_batch_generator(dev_batch_gen, places=places) # this argument must be cuda_places (one per card)
paddle 版本 1.5.0
cuda 9.0
cudnn 7.0
使用io.Pyreader 在单机多卡情况下出现错误
报错如下