加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
senti-class.py 4.28 KB
一键复制 编辑 原始数据 按行查看 历史
SHRBAI 提交于 2024-05-17 15:24 . 源程序
import numpy as np
# 1. Data preprocessing: read "label,text" lines, collect labels, texts, and a char vocab.
max_length = 80  # every text is truncated/padded to this many characters
labels = []      # binary labels parsed from column 0 (any non-zero value -> 1)
context = []     # raw Chinese texts
vocab = set()    # set of distinct characters seen across the corpus
with open("../dataset/cn/ChnSentiCorp.txt", mode="r", encoding="UTF-8") as emotion_file:
    # Iterate the file lazily instead of materializing it with readlines().
    for line in emotion_file:
        line = line.strip().split(",")  # strip whitespace, split on commas
        if not line[0]:
            continue  # a blank line would make int("") raise ValueError
        labels.append(0 if int(line[0]) == 0 else 1)
        # Join everything after the label; NOTE: commas inside the text are dropped,
        # matching the original behavior.
        text = "".join(line[1:])
        context.append(text)
        vocab.update(text)  # add every character of this text to the vocab
# Sorted character table; index in this list becomes the token id.
# (Name spelling kept as-is: it is referenced elsewhere in this file.)
voacb_list = list(sorted(vocab))
# Tokenize each text using the vocabulary: char -> index, padded/truncated to max_length.
# Build the lookup dict once: dict lookup is O(1), while the original
# voacb_list.index(char) scanned the whole vocab for every character (O(n*V)).
char_to_index = {ch: idx for idx, ch in enumerate(voacb_list)}
token_list = []
for text in context:
    token = [char_to_index[ch] for ch in text]
    # Unify length to max_length; pad with 0 (NOTE: 0 is also the id of the first
    # vocab character — same aliasing as the original, kept for compatibility).
    token = token[:max_length] + [0] * (max_length - len(token))
    token_list.append(token)
# Shuffle texts and labels with the same seed so each pair stays aligned,
# then carve off the first 170 examples as the held-out dev set.
seed = 17
for sequence in (token_list, labels):
    np.random.seed(seed)          # reseed before each shuffle -> identical permutation
    np.random.shuffle(sequence)
# Dev (evaluation) split: first 170 shuffled examples.
dev_list = np.array(token_list[:170])
dev_labels = np.array(labels[:170])
# Training split: everything after the first 170.
token_list = np.array(token_list[170:])
labels = np.array(labels[170:])
#2、网络定义部分
import torch
class RNNModel(torch.nn.Module):
    """Character-level feature extractor: embedding -> GRU -> LayerNorm -> biGRU.

    forward() returns the full sequence of hidden states with shape
    (batch, seq_len, 2 * 128) — the bidirectional GRU concatenates both directions.
    """
    def __init__(self, vocab_size=128):
        super().__init__()
        self.embedding_table = torch.nn.Embedding(vocab_size, embedding_dim=312)  # char embeddings
        # batch_first=True: inputs are (batch, seq). The original omitted it, so the
        # GRU treated dim 0 as the time axis and recurred over the *batch* dimension.
        self.gru = torch.nn.GRU(312, 256, batch_first=True)
        # BUG FIX: the original LayerNorm(256, 256) passed 256 as the *eps* argument,
        # which effectively disabled the normalization. normalized_shape=256 alone is intended.
        self.batch_norm = torch.nn.LayerNorm(256)
        self.gru2 = torch.nn.GRU(256, 128, bidirectional=True, batch_first=True)  # bidirectional RNN

    def forward(self, token):
        """token: (batch, seq_len) int tensor of char ids -> (batch, seq_len, 256)."""
        token_inputs = token
        embedding = self.embedding_table(token_inputs)
        # GRU returns (all hidden states, last hidden state); we keep the full sequence.
        gru_out, _ = self.gru(embedding)
        embedding = self.batch_norm(gru_out)
        out, hidden = self.gru2(embedding)
        return out
# 3. Model assembly
def get_model(vocab_size = len(voacb_list), max_length = max_length):
    """Build the full classifier: RNN hidden states -> flatten -> 2-way linear head."""
    backbone = RNNModel(vocab_size)  # yields all hidden states, (batch, seq, 2*128)
    # Linear input = bidirectional factor (2) * sequence length * hidden size (128);
    # output = 2 class logits.
    head = torch.nn.Linear(2 * max_length * 128, 2)
    return torch.nn.Sequential(backbone, torch.nn.Flatten(), head)
# 4. Hyper-parameters and training loop
device = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when no GPU
model = get_model().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
loss_func = torch.nn.CrossEntropyLoss()
batch_size = 128
train_length = len(labels)
for epoch in range(20):  # 20 epochs (the original range(21) ran 21 despite its "20" comment)
    train_num = train_length // batch_size  # note: the trailing partial batch is dropped
    train_loss, train_correct = 0.0, 0.0
    model.train()
    for i in range(train_num):
        # Take one batch of inputs/labels and move them to the device.
        start = i * batch_size
        end = start + batch_size
        batch_input_ids = torch.as_tensor(token_list[start:end]).to(device)
        # CrossEntropyLoss requires int64 class indices as targets; the original
        # cast to torch.uint8, which modern PyTorch rejects.
        batch_labels = torch.as_tensor(labels[start:end]).long().to(device)
        pred = model(batch_input_ids)
        loss = loss_func(pred, batch_labels)
        # Standard gradient-descent step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # Per-batch accuracy, averaged over batches below.
        train_correct += (torch.argmax(pred, dim=-1) == batch_labels).float().mean().item()
    # Report this epoch's mean loss and accuracy.
    train_loss /= train_num
    train_correct /= train_num
    print("train_loss:", train_loss, "train_correct:", train_correct)
    # Evaluate on the dev set without building an autograd graph.
    model.eval()
    with torch.no_grad():
        test_pred = model(torch.as_tensor(dev_list).to(device))
        dev_targets = torch.as_tensor(dev_labels).to(device)
        correct = (torch.argmax(test_pred, dim=-1) == dev_targets).float().mean().item()
    print("test_acc:", correct)
    print("-------------------")
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化