加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
main.py 2.81 KB
一键复制 编辑 原始数据 按行查看 历史
iverson yao 提交于 2024-01-16 05:49 . 主程序和数据集
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import re
from gensim.models import KeyedVectors
from sklearn.svm import SVC
# Load the raw CSV files and clean up the training frame:
# duplicated rows are dropped and missing cells become empty strings.
train_data = pd.read_csv(r'C:\Users\iverson\Desktop\train.news.csv')
test_data = pd.read_csv(r'C:\Users\iverson\Desktop\test.feature.csv')
train_data.drop_duplicates(inplace=True)
train_data.fillna('', inplace=True)

# Remove every character that is neither a word character nor whitespace,
# then remove plain spaces; both substitutions run in one pass per title.
pattern = r'[^\w\s]'
titles_train = train_data['Title'].apply(
    lambda text: re.sub(r' ', '', re.sub(pattern, '', text))
)

# The stop-word file holds one entry per line; surrounding whitespace is trimmed.
with open(r'C:\Users\iverson\Desktop\stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}

# Segment each title with jieba, discard stop words, and re-join the remaining
# tokens with single spaces so later steps can split on whitespace.
titles_train = titles_train.apply(
    lambda text: ' '.join(token for token in jieba.cut(text) if token not in stop_words)
)
labels = train_data['label']

# Feature extraction: load the word vectors from a text-format
# (binary=False) word2vec file.
word2vec_model = KeyedVectors.load_word2vec_format(
    r'C:\Users\iverson\Desktop\学习\sgns.wiki.bigram-char',
    binary=False,
    encoding='utf-8',
)
# Convert whitespace-tokenised title strings into averaged word-vector features.
def get_word2vec_features(titles, word2vec_model):
    """Return one averaged word vector per title.

    Args:
        titles: Iterable of strings whose tokens are separated by whitespace.
        word2vec_model: Object supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute (e.g. gensim ``KeyedVectors``).

    Returns:
        A 2-D ``numpy.ndarray`` with one row per title and ``vector_size``
        columns. A title with no token known to the model yields a zero row.
        An empty ``titles`` yields an array of shape ``(0, vector_size)`` so
        the result can always be stacked horizontally with other 2-D features.
    """
    dim = word2vec_model.vector_size
    features = []
    for title in titles:
        title_vectors = [
            word2vec_model[word] for word in title.split() if word in word2vec_model
        ]
        if title_vectors:
            features.append(np.mean(title_vectors, axis=0))
        else:
            # None of the title's tokens is in the model: fall back to zeros.
            features.append(np.zeros(dim))
    if not features:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.empty((0, dim))
    return np.array(features)
# Build the training features: averaged word vectors plus TF-IDF weights.
word2vec_features_train = get_word2vec_features(titles_train, word2vec_model)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(titles_train)
# Concatenate the (densified) TF-IDF matrix with the word2vec features.
X_train_combined = np.hstack((X_train.toarray(), word2vec_features_train))

# Train the classifier.
# NOTE(review): scikit-learn documents `gamma` as the coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so it presumably has no effect with
# kernel='linear' — confirm before tuning it.
model_combined = SVC(C=5, gamma=0.05, kernel='linear')
model_combined.fit(X_train_combined, labels)

# Predict on the test set.
# Missing test titles are replaced with '' first, mirroring the fillna('')
# applied to the training frame; otherwise re.sub raises TypeError on NaN.
titles_test = test_data['Title'].fillna('')
titles_test = titles_test.apply(lambda x: re.sub(pattern, '', x))
titles_test = titles_test.apply(lambda x: re.sub(r' ', '', x))
titles_test = titles_test.apply(
    lambda x: ' '.join([word for word in jieba.cut(x) if word not in stop_words])
)
word2vec_features_test = get_word2vec_features(titles_test, word2vec_model)
# Reuse the vectorizer fitted on the training titles so columns line up.
X_test = vectorizer.transform(titles_test)
X_test_combined = np.hstack((X_test.toarray(), word2vec_features_test))
y_pred_test = model_combined.predict(X_test_combined)

# Save the predictions to a CSV file without the index column.
result_df = pd.DataFrame({'id': test_data['id'], 'label': y_pred_test})
result_df.to_csv(r'C:\Users\iverson\Desktop\result.csv', index=False)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化