dygraph GPU memory issue
  • Environment info
       1) PaddlePaddle version: paddle 1.5
  • Training info
       1) Single machine, single GPU
       2) GPU memory info
  • Reproduction info: the relevant model code is roughly as follows:
# -*- coding: utf-8 -*-  
import paddle.fluid as fluid
import numpy as np


class ConvBnLayer(fluid.dygraph.Layer):
    def __init__(self, name_scope, num_filters, filter_size,
                 stride=1, groups=1, act=None):
        super(ConvBnLayer, self).__init__(name_scope)

        self.conv2d = fluid.dygraph.Conv2D('conv2d', num_filters=num_filters, filter_size=filter_size,
                                           stride=stride, padding=(filter_size - 1) // 2,
                                           groups=groups, bias_attr=False,
                                           param_attr=fluid.ParamAttr(name="weights"))
        self.batch_norm = fluid.dygraph.BatchNorm(self.full_name(), num_filters, act=act)

    def forward(self, inputs):
        out = self.conv2d(inputs)
        out = self.batch_norm(out)

        return out


class ShortCut(fluid.dygraph.Layer):
    def __init__(self, name_scope, ch_out, stride):
        super(ShortCut, self).__init__(name_scope)

        self.ch_out = ch_out
        self.stride = stride
        self.conv = ConvBnLayer(self.full_name(), ch_out, 1, stride)

    def forward(self, inputs):
        ch_in = inputs.shape[1]
        if ch_in != self.ch_out or self.stride != 1:
            return self.conv(inputs)
        else:
            return inputs


class BottleneckBlock(fluid.dygraph.Layer):
    def __init__(self, name_scope, num_filters, stride):
        super(BottleneckBlock, self).__init__(name_scope)

        self.conv0 = ConvBnLayer(self.full_name(), num_filters,
                                 filter_size=1,
                                 act='relu')
        self.conv1 = ConvBnLayer(self.full_name(), num_filters, filter_size=3,
                                 stride=stride, act='relu')
        self.conv2 = ConvBnLayer(self.full_name(), num_filters * 4, filter_size=1,
                                 act=None)
        self.short = ShortCut(self.full_name(), num_filters * 4, stride)

    def forward(self, inputs):
        out = self.conv0(inputs)
        out = self.conv1(out)
        out = self.conv2(out)

        short = self.short(inputs)

        return fluid.layers.elementwise_add(short, out, act='relu')


class DecoderBlock(fluid.dygraph.Layer):
    def __init__(self, name_scope, num_filters):
        super(DecoderBlock, self).__init__(name_scope)

        self.dimension_reduction = ConvBnLayer(self.full_name(), num_filters // 2,
                                               filter_size=1, act='relu')

        self.conv1 = ConvBnLayer(self.full_name(), num_filters // 2,
                                 filter_size=3, stride=1, act='relu')

        self.conv2 = ConvBnLayer(self.full_name(), num_filters // 2,
                                 filter_size=3, stride=1, act='relu')

    def forward(self, inputs, feature_map):
        out = self.dimension_reduction(inputs)
        b, c, h, w = out.shape

        # upsample out to twice the spatial size
        out = fluid.layers.resize_bilinear(out, out_shape=[h * 2, w * 2])
        # concatenate with the encoder feature map (skip connection)
        out = fluid.layers.concat([out, feature_map], axis=1)

        out = self.conv1(out)
        out = self.conv2(out)
        return out


class Decoder(fluid.dygraph.Layer):
    def __init__(self, name_scope):
        super(Decoder, self).__init__(name_scope)

        self.decode_1 = DecoderBlock(self.full_name(), 2048)
        self.decode_2 = DecoderBlock(self.full_name(), 1024)
        self.decode_3 = DecoderBlock(self.full_name(), 512)
        self.decode_4 = ConvBnLayer(self.full_name(), 3, 1)

    def forward(self, inputs, feature_map):
        out = self.decode_1(inputs, feature_map[2])
        out = self.decode_2(out, feature_map[1])
        out = self.decode_3(out, feature_map[0])
        out = self.decode_4(out)
        return out


class DisResNet(fluid.dygraph.Layer):
    def __init__(self, name_scope, layers):
        super(DisResNet, self).__init__(name_scope)

        self.layers = layers
        support_layers = [50, 101, 152]
        assert layers in support_layers, \
            "supported layers are {} but input layer is {}".format(support_layers, layers)

        if layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        else:
            depth = [3, 8, 36, 3]
        num_filters = [64, 128, 256, 512]

        self.bottleneck_deep_list = []
        for block in range(len(depth)):
            bottleneck_block_list = []
            for i in range(depth[block]):
                bottleneck_block = BottleneckBlock(self.full_name(),
                                                   num_filters=num_filters[block],
                                                   stride=2 if i == 0 and block != 0 else 1)
                bottleneck_block_list.append(bottleneck_block)
            self.bottleneck_deep_list.append(bottleneck_block_list)

        self.decoder = Decoder(self.full_name())
        self.feature_map = []

    def forward(self, inputs):
        out = inputs

        for bottleneck_block_list in self.bottleneck_deep_list:
            for bottleneck_block in bottleneck_block_list:
                out = bottleneck_block(out)
            self.feature_map.append(out)

        out = self.decoder(out, self.feature_map)

        return out


if __name__ == '__main__':
    with fluid.dygraph.guard():
        seresnext = DisResNet('seresnext', 50)
        img = np.zeros([2, 3, 224, 224]).astype('float32')
        label = np.zeros([2, 3, 224, 224]).astype('float32')
        gt_box = [[30, 30], [60, 60]]
        local_label = label[:, :, gt_box[0][0]:gt_box[1][0], gt_box[0][1]:gt_box[1][1]]
        img = fluid.dygraph.to_variable(img)
        label = fluid.dygraph.to_variable(label)
        local_label = fluid.dygraph.to_variable(local_label)
        outs = seresnext(img)

        local_out = outs[:, :, gt_box[0][0]:gt_box[1][0], gt_box[0][1]:gt_box[1][1]]
        loss = fluid.layers.square_error_cost(outs, label)
        local_loss = fluid.layers.square_error_cost(local_out, local_label)
        mean_loss = fluid.layers.mean(loss)
        mean_local_loss = fluid.layers.mean(local_loss)

        total_loss = 0.7 * mean_local_loss + 0.3 * mean_loss
        total_loss.backward()
        print(total_loss)
  • Problem description: during training, GPU memory usage increases after every step, and the program eventually terminates because it runs out of memory.
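A quick way to confirm the per-step growth (not part of the original post; this sketch assumes the pynvml package is installed) is to log device memory once per training step:

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU 0

def log_gpu_mem(step):
    # Total used device memory on GPU 0, as nvidia-smi would report it.
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("step {}: {:.1f} MiB used".format(step, info.used / 1024 ** 2))

Calling log_gpu_mem(step) after each step makes the steady increase easy to see.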
All comments (5)
AIStudio784460
#2 replied 2020-01

I looked at the code. In this section:
for bottleneck_block_list in self.bottleneck_deep_list:
    for bottleneck_block in bottleneck_block_list:
        out = bottleneck_block(out)
    self.feature_map.append(out)
each execution of the outermost loop adds one more Variable and one more dict. Could self.feature_map.append(out) be the cause?
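As a quick check (this snippet is not from the reply; it reuses the seresnext and img objects from the reproduction script above), the growth of that member list can be observed directly:

# Assumption: run after building `seresnext` and `img` as in the script above.
for step in range(10):
    outs = seresnext(img)
    # self.feature_map gains 4 new entries (one per block group) on every
    # forward pass, so Variables from all previous steps stay referenced.
    print("step {}: len(feature_map) = {}".format(step, len(seresnext.feature_map)))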

AIStudio789739
#3 replied 2020-01
@DDDivano

OK, I'll debug it and take a look.

AIStudio789739
#4 replied 2020-01

It was indeed this issue, thanks a lot!

surlrise
#5 replied 2020-05
(quoting #4) It was indeed this issue, thanks a lot!

Hello, how exactly did you end up solving this problem? I've run into a similar issue.

自尊心3
#6 replied 2020-05
(quoting #5) Hello, how exactly did you end up solving this problem? I've run into a similar issue.

Most likely variables are being stored repeatedly, which causes the memory to overflow.
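For anyone landing here later, a minimal sketch of one possible fix (an assumption based on the diagnosis above, not taken from the thread): keep the feature maps in a local list inside forward(), so they are released when the step finishes, instead of appending to self.feature_map, which is never cleared:

    def forward(self, inputs):
        out = inputs
        feature_map = []  # local list, released when forward() returns

        for bottleneck_block_list in self.bottleneck_deep_list:
            for bottleneck_block in bottleneck_block_list:
                out = bottleneck_block(out)
            feature_map.append(out)

        out = self.decoder(out, feature_map)
        return out

Equivalently, self.feature_map could be reset to an empty list at the top of every forward() call; the point is simply not to keep accumulating outputs across steps.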
