dygraph GPU memory issue
  • Environment info
       1) PaddlePaddle version: paddle 1.5
  • Training info
       1) Single machine, single GPU
       2) GPU memory info
  • Reproduction info: the relevant model code is roughly as follows:
# -*- coding: utf-8 -*-  
import paddle.fluid as fluid
import numpy as np


class ConvBnLayer(fluid.dygraph.Layer):
    def __init__(self, name_scope, num_filters, filter_size,
                 stride=1, groups=1, act=None):
        super(ConvBnLayer, self).__init__(name_scope)

        self.conv2d = fluid.dygraph.Conv2D('conv2d', num_filters=num_filters, filter_size=filter_size,
                                           stride=stride, padding=(filter_size - 1) // 2,
                                           groups=groups, bias_attr=False,
                                           param_attr=fluid.ParamAttr(name="weights"))
        self.batch_norm = fluid.dygraph.BatchNorm(self.full_name(), num_filters, act=act)

    def forward(self, inputs):
        out = self.conv2d(inputs)
        out = self.batch_norm(out)

        return out


class ShortCut(fluid.dygraph.Layer):
    def __init__(self, name_scope, ch_out, stride):
        super(ShortCut, self).__init__(name_scope)

        self.ch_out = ch_out
        self.stride = stride
        self.conv = ConvBnLayer(self.full_name(), ch_out, 1, stride)

    def forward(self, inputs):
        ch_in = inputs.shape[1]
        if ch_in != self.ch_out or self.stride != 1:
            return self.conv(inputs)
        else:
            return inputs


class BottleneckBlock(fluid.dygraph.Layer):
    def __init__(self, name_scope, num_filters, stride):
        super(BottleneckBlock, self).__init__(name_scope)

        self.conv0 = ConvBnLayer(self.full_name(), num_filters,
                                 filter_size=1,
                                 act='relu')
        self.conv1 = ConvBnLayer(self.full_name(), num_filters, filter_size=3,
                                 stride=stride, act='relu')
        self.conv2 = ConvBnLayer(self.full_name(), num_filters * 4, filter_size=1,
                                 act=None)
        self.short = ShortCut(self.full_name(), num_filters * 4, stride)

    def forward(self, inputs):
        out = self.conv0(inputs)
        out = self.conv1(out)
        out = self.conv2(out)

        short = self.short(inputs)

        return fluid.layers.elementwise_add(short, out, act='relu')


class DecoderBlock(fluid.dygraph.Layer):
    def __init__(self, name_scope, num_filters):
        super(DecoderBlock, self).__init__(name_scope)

        self.dimension_reduction = ConvBnLayer(self.full_name(), num_filters // 2,
                                               filter_size=1, act='relu')

        self.conv1 = ConvBnLayer(self.full_name(), num_filters // 2,
                                 filter_size=3, stride=1, act='relu')

        self.conv2 = ConvBnLayer(self.full_name(), num_filters // 2,
                                 filter_size=3, stride=1, act='relu')

    def forward(self, inputs, feature_map):
        out = self.dimension_reduction(inputs)
        b, c, h, w = out.shape

        # upsample out to twice the spatial size
        out = fluid.layers.resize_bilinear(out, out_shape=[h * 2, w * 2])
        # concatenate with the encoder feature map (skip connection)
        out = fluid.layers.concat([out, feature_map], axis=1)

        out = self.conv1(out)
        out = self.conv2(out)
        return out


class Decoder(fluid.dygraph.Layer):
    def __init__(self, name_scope):
        super(Decoder, self).__init__(name_scope)

        self.decode_1 = DecoderBlock(self.full_name(), 2048)
        self.decode_2 = DecoderBlock(self.full_name(), 1024)
        self.decode_3 = DecoderBlock(self.full_name(), 512)
        self.decode_4 = ConvBnLayer(self.full_name(), 3, 1)

    def forward(self, inputs, feature_map):
        out = self.decode_1(inputs, feature_map[2])
        out = self.decode_2(out, feature_map[1])
        out = self.decode_3(out, feature_map[0])
        out = self.decode_4(out)
        return out


class DisResNet(fluid.dygraph.Layer):
    def __init__(self, name_scope, layers):
        super(DisResNet, self).__init__(name_scope)

        self.layers = layers
        support_layers = [50, 101, 152]
        assert layers in support_layers, \
            "supported layers are {} but input layer is {}".format(support_layers, layers)

        if layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        else:
            depth = [3, 8, 36, 3]
        num_filters = [64, 128, 256, 512]

        self.bottleneck_deep_list = []
        for block in range(len(depth)):
            bottleneck_block_list = []
            for i in range(depth[block]):
                bottleneck_block = BottleneckBlock(self.full_name(),
                                                   num_filters=num_filters[block],
                                                   stride=2 if i == 0 and block != 0 else 1)
                bottleneck_block_list.append(bottleneck_block)
            self.bottleneck_deep_list.append(bottleneck_block_list)

        self.decoder = Decoder(self.full_name())
        self.feature_map = []

    def forward(self, inputs):
        out = inputs

        for bottleneck_block_list in self.bottleneck_deep_list:
            for bottleneck_block in bottleneck_block_list:
                out = bottleneck_block(out)
            self.feature_map.append(out)

        out = self.decoder(out, self.feature_map)

        return out


if __name__ == '__main__':
    with fluid.dygraph.guard():
        seresnext = DisResNet('seresnext', 50)
        img = np.zeros([2, 3, 224, 224]).astype('float32')
        label = np.zeros([2, 3, 224, 224]).astype('float32')
        gt_box = [[30, 30], [60, 60]]
        local_label = label[:, :, gt_box[0][0]:gt_box[1][0], gt_box[0][1]:gt_box[1][1]]
        img = fluid.dygraph.to_variable(img)
        label = fluid.dygraph.to_variable(label)
        local_label = fluid.dygraph.to_variable(local_label)
        outs = seresnext(img)

        local_out = outs[:, :, gt_box[0][0]:gt_box[1][0], gt_box[0][1]:gt_box[1][1]]
        loss = fluid.layers.square_error_cost(outs, label)
        local_loss = fluid.layers.square_error_cost(local_out, local_label)
        mean_loss = fluid.layers.mean(loss)
        mean_local_loss = fluid.layers.mean(local_loss)

        total_loss = 0.7 * mean_local_loss + 0.3 * mean_loss
        total_loss.backward()
        print(total_loss)
  • Problem description: during training, GPU memory usage increases after every step, and the program eventually terminates because it runs out of memory.
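A quick way to confirm the per-step growth (not part of the original post; this sketch assumes the pynvml package is installed) is to log device memory once per training step:

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU 0

def log_gpu_mem(step):
    # Total used device memory on GPU 0, as nvidia-smi would report it.
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("step {}: {:.1f} MiB used".format(step, info.used / 1024 ** 2))

Calling log_gpu_mem(step) after each step makes the steady increase easy to see.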
All comments (5)
AIStudio784460
#2 replied 2020-01

I looked at the code. In this section:
for bottleneck_block_list in self.bottleneck_deep_list:
    for bottleneck_block in bottleneck_block_list:
        out = bottleneck_block(out)
    self.feature_map.append(out)
each execution of the outermost loop adds one more Variable and one more dict. Could self.feature_map.append(out) be the cause?
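As a quick check (this snippet is not from the reply; it reuses the seresnext and img objects from the reproduction script above), the growth of that member list can be observed directly:

# Assumption: run after building `seresnext` and `img` as in the script above.
for step in range(10):
    outs = seresnext(img)
    # self.feature_map gains 4 new entries (one per block group) on every
    # forward pass, so Variables from all previous steps stay referenced.
    print("step {}: len(feature_map) = {}".format(step, len(seresnext.feature_map)))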

AIStudio789739
#3 replied 2020-01
@DDDivano

OK, I'll debug it and take a look.

AIStudio789739
#4 replied 2020-01

It was indeed this issue, thanks a lot!

surlrise
#5 replied 2020-05
(quoting #4) It was indeed this issue, thanks a lot!

Hello, how exactly did you end up solving this problem? I've run into a similar issue.

自尊心3
#6 replied 2020-05
(quoting #5) Hello, how exactly did you end up solving this problem? I've run into a similar issue.

Most likely variables are being stored repeatedly, which causes the memory to overflow.
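For anyone landing here later, a minimal sketch of one possible fix (an assumption based on the diagnosis above, not taken from the thread): keep the feature maps in a local list inside forward(), so they are released when the step finishes, instead of appending to self.feature_map, which is never cleared:

    def forward(self, inputs):
        out = inputs
        feature_map = []  # local list, released when forward() returns

        for bottleneck_block_list in self.bottleneck_deep_list:
            for bottleneck_block in bottleneck_block_list:
                out = bottleneck_block(out)
            feature_map.append(out)

        out = self.decoder(out, feature_map)
        return out

Equivalently, self.feature_map could be reset to an empty list at the top of every forward() call; the point is simply not to keep accumulating outputs across steps.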
