#!/usr/bin/env python
# -*- coding: utf8 -*-
# @Date : 2020/11/12
# @Author : mingming.xu
# @Email : xv44586@gmail.com
"""
bert每层捕获的信息不同,代表的语义粒度也不同,将不同粒度的信息拼接起来,然后送进CNN后做分类。
ret:
https://arxiv.org/pdf/2008.06460.pdf
"""
import os
from tqdm import tqdm
import numpy as np
from toolkit4nlp.utils import *
from toolkit4nlp.models import *
from toolkit4nlp.layers import *
from toolkit4nlp.optimizers import *
from toolkit4nlp.tokenizers import Tokenizer
from toolkit4nlp.backend import *
batch_size = 16
maxlen = 280
epochs = 5
lr = 2e-5
# BERT config
config_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/bert_config.json'
checkpoint_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/model.ckpt'
dict_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/vocab.txt'
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
path = '/home/mingming.xu/datasets/NLP/ccf_qa_match/'
def load_data(train_test='train'):
    D = {}
    with open(os.path.join(path, train_test, train_test + '.query.tsv')) as f:
        for l in f:
            span = l.strip().split('\t')
            D[span[0]] = {'query': span[1], 'reply': []}

    with open(os.path.join(path, train_test, train_test + '.reply.tsv')) as f:
        for l in f:
            span = l.strip().split('\t')
            if len(span) == 4:
                q_id, r_id, r, label = span
            else:
                label = None
                q_id, r_id, r = span
            D[q_id]['reply'].append([r_id, r, label])

    d = []
    for k, v in D.items():
        q_id = k
        q = v['query']
        reply = v['reply']
        for r in reply:
            r_id, rc, label = r
            d.append([q_id, q, r_id, rc, label])
    return d
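# Expected data layout (inferred from the loader above): <path>/{train,test}/{train,test}.query.tsv
# holds "q_id\tquery" rows and the matching .reply.tsv holds "q_id\tr_id\treply\tlabel" rows
# (the label column is absent for the test split). Each returned record is
# [q_id, query, r_id, reply, label], with label=None for test data.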
train_data = load_data('train')
test_data = load_data('test')
class data_generator(DataGenerator):
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (q_id, q, r_id, r, label) in self.get_sample(shuffle):
            label = int(label) if label is not None else None
            # truncate each (query, reply) pair to the configured maxlen
            token_ids, segment_ids = tokenizer.encode(q, r, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if is_end or len(batch_token_ids) == self.batch_size:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# shuffle, then split 80% / 20% into train / validation
np.random.shuffle(train_data)
n = int(len(train_data) * 0.8)
train_generator = data_generator(train_data[:n], batch_size)
valid_generator = data_generator(train_data[n:], batch_size)
test_generator = data_generator(test_data, batch_size)
# load the pretrained model
bert = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    # model='bert',  # load BERT / RoBERTa / ERNIE
    # model='electra',  # load ELECTRA
    model='nezha',  # load NEZHA
    return_keras_model=False,
)
inputs = bert.inputs
outputs = []
x = bert.apply_embeddings(inputs)
for idx in range(bert.num_hidden_layers):
    x = bert.apply_transformer_layers(x, idx)
    output = Lambda(lambda x: x[:, 0:1])(x)  # keep only the [CLS] position of this layer
    outputs.append(output)
output = Concatenate(1)(outputs)
output = DGCNN(dilation_rate=1, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=2, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=2, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=1, dropout_rate=0.1)(output)
output = AttentionPooling1D()(output)
output = Dropout(0.5)(output)
output = Dense(1, activation='sigmoid')(output)
model = keras.models.Model(inputs, output)
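# Shape walkthrough (assuming the NEZHA-base config: 12 transformer layers, hidden size 768):
# each Lambda slice keeps only the [CLS] position, giving a (batch, 1, 768) tensor per layer;
# Concatenate(1) stacks them into a (batch, 12, 768) "sequence" of layer-wise [CLS] vectors,
# which the dilated gated CNNs (DGCNN) mix across layers before AttentionPooling1D reduces
# them to a single vector for the sigmoid classifier.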
model.summary()
model.compile(
    # loss=binary_focal_loss(0.25, 12),  # focal loss
    loss=K.binary_crossentropy,
    optimizer=Adam(lr),
    metrics=['accuracy'],
)
def evaluate(data):
    P, R, TP = 0., 0., 0.  # actual positives, predicted positives, true positives
    for x_true, y_true in tqdm(data):
        y_pred = model.predict(x_true)[:, 0]
        y_pred = np.round(y_pred)
        y_true = y_true[:, 0]
        R += y_pred.sum()                    # predicted positives
        P += y_true.sum()                    # actual positives
        TP += ((y_pred + y_true) > 1).sum()  # both predicted and actual positive
    print(P, R, TP)
    pre = TP / R  # precision
    rec = TP / P  # recall
    return 2 * (pre * rec) / (pre + rec)  # F1
class Evaluator(keras.callbacks.Callback):
    """Evaluate on the validation set after each epoch and keep the best weights.
    """

    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, epoch, logs=None):
        val_f1 = evaluate(valid_generator)
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
            model.save_weights('best_concat_model.weights')
        print(
            u'val_f1: %.5f, best_val_f1: %.5f\n' %
            (val_f1, self.best_val_f1)
        )
def predict_to_file(path='concat_submission.tsv', data=test_generator):
    preds = []
    for x, _ in tqdm(data):
        pred = model.predict(x)[:, 0]
        pred = np.round(pred)
        pred = pred.astype(int)
        preds.extend(pred)

    ret = []
    for d, p in zip(test_data, preds):
        q_id, _, r_id, _, _ = d
        ret.append([str(q_id), str(r_id), str(p)])

    with open(path, 'w', encoding='utf8') as f:
        for l in ret:
            f.write('\t'.join(l) + '\n')
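# Hypothetical convenience helper (not part of the original script): a minimal sketch of
# single-pair inference, reusing the tokenizer, pad_sequences and model defined above.
# Returns the rounded label and the raw sigmoid probability for one (query, reply) pair.
def predict_single(query, reply):
    token_ids, segment_ids = tokenizer.encode(query, reply, maxlen=maxlen)
    token_ids = pad_sequences([token_ids])
    segment_ids = pad_sequences([segment_ids])
    prob = model.predict([token_ids, segment_ids])[0, 0]
    return int(np.round(prob)), float(prob)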
if __name__ == '__main__':
    evaluator = Evaluator()

    model.fit_generator(
        train_generator.generator(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator],
    )

    # predict on the test set and write the submission file
    model.load_weights('best_concat_model.weights')
    predict_to_file()