Trying ELECTRA for triple extraction with bert4keras
Recently a lot of excellent work on pretrained language models has appeared, such as Google's ELECTRA and fastNLP, and several open-source groups in China have reproduced the pretraining on Chinese corpora.
The nice part is that we can now fine-tune the same task on several different pretrained models, compare how each one performs, and then ensemble the results to improve our score in competitions. The code change is actually quite small: essentially only the build_transformer_model call below is different.
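For reference, the swap looks roughly like this (a minimal sketch: the ELECTRA call is exactly the one used in the full script below, while the plain BERT/RoBERTa call is simply the default build_transformer_model usage without the model argument):

# Original BERT/RoBERTa loading
bert = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=False,
)

# Switching to ELECTRA: only the model argument is added
bert = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model="electra",
    return_keras_model=False,
)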
#! -*- coding:utf-8 -*-
# Triple extraction task, based on the "half-pointer, half-tagging" structure
# Write-up: https://kexue.fm/archives/7161
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
# Best f1 = 0.82198
# Switching to RoBERTa Large reaches f1 = 0.829+

import json
import numpy as np
from bert4keras.backend import keras, K, batch_gather
from bert4keras.layers import LayerNormalization
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, ExponentialMovingAverage
from bert4keras.snippets import open
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.tokenizers import Tokenizer
from keras.callbacks import TensorBoard
from keras.layers import Input, Dense, Lambda, Reshape
from keras.models import Model
from tqdm import tqdm

config_path = '../electra_tiny/bert_config_tiny.json'
checkpoint_path = '../electra_tiny/model.ckpt-1000000'
dict_path = '../electra_tiny/vocab.txt'
maxlen = 128
batch_size = 64


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            D.append({
                'text': l['text'],
                'spo_list': [
                    (spo['subject'], spo['predicate'], spo['object'])
                    for spo in l['spo_list']
                ]
            })
    return D


# Load the datasets
train_data = load_data('../data/baidu_relation_extraction/train_data.json')
valid_data = load_data('../data/baidu_relation_extraction/dev_data.json')

predicate2id, id2predicate = {}, {}
with open('../data/baidu_relation_extraction/all_50_schemas') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def search(pattern, sequence):
    """Search for the sub-sequence pattern inside sequence.
    Returns the first index if found, otherwise -1.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


class data_generator(DataGenerator):
    """Data generator
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids = [], []
        batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
        for is_end, d in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(d['text'], max_length=maxlen)
            # Arrange the triples as {s: [(o, p)]}
            spoes = {}
            for s, p, o in d['spo_list']:
                s = tokenizer.encode(s)[0][1:-1]
                p = predicate2id[p]
                o = tokenizer.encode(o)[0][1:-1]
                s_idx = search(s, token_ids)
                o_idx = search(o, token_ids)
                if s_idx != -1 and o_idx != -1:
                    s = (s_idx, s_idx + len(s) - 1)
                    o = (o_idx, o_idx + len(o) - 1, p)
                    if s not in spoes:
                        spoes[s] = []
                    spoes[s].append(o)
            if spoes:
                # subject labels
                subject_labels = np.zeros((len(token_ids), 2))
                for s in spoes:
                    subject_labels[s[0], 0] = 1
                    subject_labels[s[1], 1] = 1
                # Randomly pick one subject
                start, end = np.array(list(spoes.keys())).T
                start = np.random.choice(start)
                end = np.random.choice(end[end >= start])
                subject_ids = (start, end)
                # Corresponding object labels
                object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
                for o in spoes.get(subject_ids, []):
                    object_labels[o[0], o[2], 0] = 1
                    object_labels[o[1], o[2], 1] = 1
                # Build the batch
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_subject_labels.append(subject_labels)
                batch_subject_ids.append(subject_ids)
                batch_object_labels.append(object_labels)
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_subject_labels = sequence_padding(batch_subject_labels,
                                                            padding=np.zeros(2))
                    batch_subject_ids = np.array(batch_subject_ids)
                    batch_object_labels = sequence_padding(batch_object_labels,
                                                           padding=np.zeros((len(predicate2id), 2)))
                    yield [
                        batch_token_ids, batch_segment_ids,
                        batch_subject_labels, batch_subject_ids, batch_object_labels
                    ], None
                    batch_token_ids, batch_segment_ids = [], []
                    batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []


def extrac_subject(inputs):
    """Gather the subject's vector representation from output according to subject_ids.
    """
    output, subject_ids = inputs
    subject_ids = K.cast(subject_ids, 'int32')
    start = batch_gather(output, subject_ids[:, :1])
    end = batch_gather(output, subject_ids[:, 1:])
    subject = K.concatenate([start, end], 2)
    return subject[:, 0]


# Additional inputs
subject_labels = Input(shape=(None, 2), name='Subject-Labels')
subject_ids = Input(shape=(2,), name='Subject-Ids')
object_labels = Input(shape=(None, len(predicate2id), 2), name='Object-Labels')

# Load the pretrained model
bert = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model="electra",
    return_keras_model=False,
)

# Predict the subject
output = Dense(units=2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(bert.model.output)
subject_preds = Lambda(lambda x: x ** 2)(output)

subject_model = Model(bert.model.inputs, subject_preds)

# Feed in the subject and predict the object;
# Conditional Layer Normalization injects the subject into the object prediction
output = bert.model.layers[-2].get_output_at(-1)
subject = Lambda(extrac_subject)([output, subject_ids])
output = LayerNormalization(conditional=True)([output, subject])
output = Dense(units=len(predicate2id) * 2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(output)
output = Lambda(lambda x: x ** 4)(output)
object_preds = Reshape((-1, len(predicate2id), 2))(output)

object_model = Model(bert.model.inputs + [subject_ids], object_preds)

# Training model
train_model = Model(bert.model.inputs + [subject_labels, subject_ids, object_labels],
                    [subject_preds, object_preds])

mask = bert.model.get_layer('Embedding-Token').output_mask
mask = K.cast(mask, K.floatx())

subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
subject_loss = K.mean(subject_loss, 2)
subject_loss = K.sum(subject_loss * mask) / K.sum(mask)

object_loss = K.binary_crossentropy(object_labels, object_preds)
object_loss = K.sum(K.mean(object_loss, 3), 2)
object_loss = K.sum(object_loss * mask) / K.sum(mask)

train_model.add_loss(subject_loss + object_loss)
train_model.compile(optimizer=Adam(1e-5))


def extract_spoes(text):
    """Extract the triples contained in the input text.
    """
    tokens = tokenizer.tokenize(text, max_length=maxlen)
    token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
    # Extract subjects
    subject_preds = subject_model.predict([[token_ids], [segment_ids]])
    start = np.where(subject_preds[0, :, 0] > 0.6)[0]
    end = np.where(subject_preds[0, :, 1] > 0.5)[0]
    subjects = []
    for i in start:
        j = end[end >= i]
        if len(j) > 0:
            j = j[0]
            subjects.append((i, j))
    if subjects:
        spoes = []
        token_ids = np.repeat([token_ids], len(subjects), 0)
        segment_ids = np.repeat([segment_ids], len(subjects), 0)
        subjects = np.array(subjects)
        # Feed in the subjects, extract objects and predicates
        object_preds = object_model.predict([token_ids, segment_ids, subjects])
        for subject, object_pred in zip(subjects, object_preds):
            start = np.where(object_pred[:, :, 0] > 0.6)
            end = np.where(object_pred[:, :, 1] > 0.5)
            for _start, predicate1 in zip(*start):
                for _end, predicate2 in zip(*end):
                    if _start <= _end and predicate1 == predicate2:
                        spoes.append((subject, predicate1, (_start, _end)))
                        break
        return [
            (
                tokenizer.decode(token_ids[0, s[0]:s[1] + 1], tokens[s[0]:s[1] + 1]),
                id2predicate[p],
                tokenizer.decode(token_ids[0, o[0]:o[1] + 1], tokens[o[0]:o[1] + 1])
            )
            for s, p, o in spoes
        ]
    else:
        return []


class SPO(tuple):
    """Class for storing a triple.
    Behaves essentially like a tuple, but overrides __hash__ and __eq__
    so that checking whether two triples are equivalent is more forgiving.
    """
    def __init__(self, spo):
        self.spox = (
            tuple(tokenizer.tokenize(spo[0])),
            spo[1],
            tuple(tokenizer.tokenize(spo[2])),
        )

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox


def evaluate(data):
    """Evaluation function: computes f1, precision and recall.
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    f1, precision, recall = 0.0, 0.0, 0.0
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        R = set([SPO(spo) for spo in extract_spoes(d['text'])])
        T = set([SPO(spo) for spo in d['spo_list']])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' %
                             (f1, precision, recall))
        s = json.dumps(
            {
                'text': d['text'],
                'spo_list': list(T),
                'spo_list_pred': list(R),
                'new': list(R - T),
                'lack': list(T - R),
            },
            ensure_ascii=False,
            indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall


class Evaluator(keras.callbacks.Callback):
    """Evaluate and save the model.
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, epoch, logs=None):
        EMAer.apply_ema_weights()
        f1, precision, recall = evaluate(valid_data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            train_model.save_weights('best_model.weights')
        EMAer.reset_old_weights()
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
              (f1, precision, recall, self.best_val_f1))


if __name__ == '__main__':

    train_generator = data_generator(train_data, batch_size)
    evaluator = Evaluator()
    EMAer = ExponentialMovingAverage(0.999)
    tf_board_callback = TensorBoard(log_dir='../ner_tf_dir', update_freq=10)

    train_model.fit_generator(train_generator.forfit(),
                              steps_per_epoch=len(train_generator),
                              epochs=20,
                              callbacks=[evaluator, EMAer, tf_board_callback])

else:

    train_model.load_weights('best_model.weights')
Below is part of the training output from a run on my local machine; the results look pretty decent.
Epoch 1/20
2705/2705 [==============================] - 221s 82ms/step - loss: 0.4439
f1: 0.00000, precision: 1.00000, recall: 0.00000: : 21639it [00:53, 406.05it/s]
f1: 0.00000, precision: 1.00000, recall: 0.00000, best f1: 0.00000
Epoch 2/20
2705/2705 [==============================] - 221s 82ms/step - loss: 0.1126
f1: 0.57437, precision: 0.81871, recall: 0.44235: : 21639it [01:39, 217.19it/s]
f1: 0.57437, precision: 0.81871, recall: 0.44235, best f1: 0.57437
Epoch 3/20
2705/2705 [==============================] - 220s 81ms/step - loss: 0.0785
f1: 0.67632, precision: 0.77211, recall: 0.60168: : 21639it [01:43, 209.86it/s]
f1: 0.67632, precision: 0.77211, recall: 0.60168, best f1: 0.67632
Epoch 4/20
2705/2705 [==============================] - 220s 81ms/step - loss: 0.0671
f1: 0.70757, precision: 0.77068, recall: 0.65402: : 21639it [01:43, 208.13it/s]
f1: 0.70757, precision: 0.77068, recall: 0.65402, best f1: 0.70757
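After training, the saved weights can be reloaded and extract_spoes (defined in the script above) called directly on new text. A minimal usage sketch — the example sentence is made up, and best_model.weights is the file written by the Evaluator callback:

train_model.load_weights('best_model.weights')  # subject_model / object_model share these weights
for spo in extract_spoes(u'《三体》是刘慈欣创作的长篇科幻小说'):
    print(spo)  # (subject, predicate, object), decoded back to text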
Comments:
The bert4keras I installed doesn't have ExponentialMovingAverage — which version are you using?
I'm currently an undergraduate and I've also read Su Jianlin's triple-extraction code. When I ran it with bert4keras, the F1 stayed at 0 (with a 4000-line training set). I then printed the gold relations and the predicted relations on the dev set (training on 2000 samples) and found that the model doesn't predict any relation at all — it isn't that the predictions are wrong, there simply are none. After that I trained for one epoch on 1000 samples and printed the predicted triples, and found the program cannot extract any subject. Roughly how could I fix this? (wry smile)
bert4keras == 0.7.0
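If you want to confirm which version you actually have installed, a quick check looks like this (a sketch; the script above was run with bert4keras 0.7.0, which does include ExponentialMovingAverage):

import bert4keras
print(bert4keras.__version__)

# or pin the version when installing:
# pip install bert4keras==0.7.0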
It might be caused by a CUDA incompatibility or by the machine running out of memory.
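Besides the environment, one thing worth ruling out is whether the subject scores simply never cross the 0.6 / 0.5 thresholds used in extract_spoes early in training. A small diagnostic sketch based on the script above (the sample sentence is hypothetical; any dev-set sentence works):

text = u'周杰伦出生于台湾'
token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
preds = subject_model.predict([[token_ids], [segment_ids]])
# If these maxima stay close to 0, no subject will ever be extracted
print('max subject-start score:', preds[0, :, 0].max())
print('max subject-end score:', preds[0, :, 1].max())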