# loader.py
# encoding: utf-8
from collections import Counter
import codecs
import re

import numpy as np
import jieba
import tensorflow.contrib.keras as kr  # requires TensorFlow 1.x; tf.contrib was removed in TF 2.x

def read_file(filename):
    """
    Args:
        filename: train_filename, test_filename, or val_filename
    Returns:
        two lists, where the first holds the labels and the second holds
        the contents segmented by jieba
    """
    # Keep runs of CJK characters, letters and digits; everything else
    # (punctuation, whitespace) acts as a delimiter.
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9]+)")
    contents, labels = [], []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                line = line.rstrip()
                assert len(line.split('\t')) == 2
                label, content = line.split('\t')
                labels.append(label)
                blocks = re_han.split(content)
                word = []
                for blk in blocks:
                    if re_han.match(blk):
                        for w in jieba.cut(blk):
                            # if len(w) >= 2:
                            word.append(w)
                contents.append(word)
            except (AssertionError, ValueError):
                # Skip malformed lines that do not match "label\tcontent".
                pass
    return labels, contents
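
# Usage sketch (the path below is an assumption, not a file shipped with this
# repo; each line of the file is expected to be "label\tcontent"):
#     labels, contents = read_file('data/train.txt')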

def build_vocab(filenames, vocab_dir, vocab_size=23051):
    """
    Args:
        filenames: list of train_filename, test_filename, val_filename
        vocab_dir: path of vocab_filename
        vocab_size: number of vocabulary entries
    Returns:
        writes the vocab to vocab_filename, one word per line
    """
    all_data = []
    for filename in filenames:
        _, data_train = read_file(filename)
        for content in data_train:
            all_data.extend(content)
    counter = Counter(all_data)
    # Reserve one slot for the '<PAD>' token used by sequence padding.
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    words = ['<PAD>'] + list(words)
    with codecs.open(vocab_dir, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')
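
# Usage sketch (paths are assumptions): build the vocabulary over all splits
# so that '<PAD>' ends up at id 0.
#     build_vocab(['data/train.txt', 'data/test.txt', 'data/val.txt'],
#                 'data/vocab.txt', vocab_size=23051)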

def read_vocab(vocab_dir):
    """
    Args:
        vocab_dir: path of vocab_filename
    Returns:
        words: a list of vocab
        word_to_id: a dict mapping each word to its id
    """
    with codecs.open(vocab_dir, 'r', encoding='utf-8') as f:
        words = f.read().strip().split('\n')
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
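
# Usage sketch (path is an assumption): '<PAD>' maps to id 0 because it is
# the first line of the vocab file written by build_vocab().
#     words, word_to_id = read_vocab('data/vocab.txt')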

def read_category():
    """
    Args:
        None
    Returns:
        categories: a list of labels
        cat_to_id: a dict mapping each label to its id
    """
    # categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    categories = ['儿科', '耳鼻咽喉科', '风湿免疫科', '妇产科', '肝胆外科', '感染科 传染科',
                  '肛肠外科', '骨科', '呼吸内科', '急诊科', '精神心理科',
                  '口腔科', '泌尿外科', '内分泌科', '皮肤科', '普通内科', '普外科', '乳腺外科', '烧伤科',
                  '神经内科', '神经外科', '疼痛科 麻醉科', '头颈外科',
                  '消化内科', '心血管内科', '性病科', '胸外科', '血液科', '眼科', '疫苗科', '影像检验科', '整形科', '中医科', '肿瘤科'
                  ]
    print("number of categories:", len(categories))
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id

def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """
    Args:
        filename: train_filename, test_filename or val_filename
        word_to_id: obtained from read_vocab()
        cat_to_id: obtained from read_category()
        max_length: maximum allowed sentence length
    Returns:
        x_pad: padded id sequences built from the sentences
        y_pad: one-hot encoded labels
    """
    labels, contents = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        # Words missing from the vocabulary are silently dropped.
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length, padding='post', truncating='post')
    # Pass num_classes explicitly so the one-hot width stays stable even when
    # a split does not contain every category.
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
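
# Usage sketch (path is an assumption): x_train has shape (n, 600) and
# y_train has shape (n, len(cat_to_id)).
#     x_train, y_train = process_file('data/train.txt', word_to_id, cat_to_id)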

def batch_iter(x, y, batch_size=64):
    """
    Args:
        x: x_pad obtained from process_file()
        y: y_pad obtained from process_file()
    Yields:
        (input_x, input_y) tuples of at most batch_size samples
    """
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    # Shuffle x and y with the same permutation so pairs stay aligned.
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
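
# Usage sketch: one epoch over the training data in shuffled mini-batches.
#     for batch_x, batch_y in batch_iter(x_train, y_train, batch_size=64):
#         ...  # feed batch_x / batch_y to the model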

def export_word2vec_vectors(vocab, word2vec_dir, trimmed_filename):
    """
    Args:
        vocab: word_to_id
        word2vec_dir: file path of the word vectors trained with word2vec
        trimmed_filename: file path for the numpy file holding the trimmed vectors
    Returns:
        saves the vocab vectors to a compressed numpy file
    """
    with codecs.open(word2vec_dir, 'r', encoding='utf-8') as file_r:
        # The first line of a text-format word2vec file is "<vocab_size> <vector_dim>".
        line = file_r.readline()
        voc_size, vec_dim = map(int, line.split(' '))
        embeddings = np.zeros([len(vocab), vec_dim])
        line = file_r.readline()
        while line:
            try:
                items = line.rstrip().split(' ')
                word = items[0]
                vec = np.asarray(items[1:], dtype='float32')
                if word in vocab:
                    word_idx = vocab[word]
                    embeddings[word_idx] = vec
            except (ValueError, IndexError):
                # Skip malformed vector lines.
                pass
            line = file_r.readline()
    np.savez_compressed(trimmed_filename, embeddings=embeddings)
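
# Usage sketch (paths are assumptions): trim a pretrained word2vec text file
# down to only the words present in the vocabulary.
#     export_word2vec_vectors(word_to_id, 'data/word2vec.txt', 'data/vectors.npz')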

def get_training_word2vec_vectors(filename):
    """
    Args:
        filename: numpy file written by export_word2vec_vectors()
    Returns:
        data["embeddings"]: a matrix of vocab vectors
    """
    with np.load(filename) as data:
        return data["embeddings"]