代码拉取完成,页面将自动刷新
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import re
from gensim.models import KeyedVectors
from sklearn.svm import SVC
# Load the datasets and preprocess the training titles.
train_data = pd.read_csv(r'C:\Users\iverson\Desktop\train.news.csv')
test_data = pd.read_csv(r'C:\Users\iverson\Desktop\test.feature.csv')
train_data.drop_duplicates(inplace=True)
train_data.fillna('', inplace=True)

# Matches every character that is neither a word character nor whitespace.
pattern = r'[^\w\s]'
titles_train = (
    train_data['Title']
    .apply(lambda text: re.sub(pattern, '', text))  # drop punctuation/symbols
    .apply(lambda text: re.sub(r' ', '', text))     # drop literal spaces
)

# Stop words: one entry per line of the file, surrounding whitespace stripped.
with open(r'C:\Users\iverson\Desktop\stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}


def _tokenize_title(text):
    """Segment *text* with jieba and join the non-stop-word tokens with spaces."""
    return ' '.join(token for token in jieba.cut(text) if token not in stop_words)


titles_train = titles_train.apply(_tokenize_title)
labels = train_data['label']

# Feature extraction: load word vectors from a word2vec-format text file
# (binary=False).
word2vec_model = KeyedVectors.load_word2vec_format(
    r'C:\Users\iverson\Desktop\学习\sgns.wiki.bigram-char',
    binary=False,
    encoding='utf-8',
)
# Convert the title texts of the training/validation sets into word-vector
# representations.
def get_word2vec_features(titles, word2vec_model):
    """Return one averaged word vector per title.

    Args:
        titles: Iterable of strings; each is split on whitespace into tokens.
        word2vec_model: Object supporting ``word in model``, ``model[word]``
            and exposing a ``vector_size`` attribute.

    Returns:
        A NumPy array with one row per title. Each row is the element-wise
        mean of the vectors of the title's tokens that are present in
        ``word2vec_model``; a title with no known token gets a zero vector
        of length ``vector_size``. For empty ``titles`` the result has shape
        ``(0, vector_size)`` so that it can still be stacked horizontally
        with other per-title feature matrices.
    """
    features = []
    for title in titles:
        title_vectors = [
            word2vec_model[word]
            for word in title.split()
            if word in word2vec_model
        ]
        if title_vectors:
            features.append(np.mean(title_vectors, axis=0))
        else:
            # None of the title's words are in the model: use a zero vector.
            features.append(np.zeros(word2vec_model.vector_size))
    if not features:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.zeros((0, word2vec_model.vector_size))
    return np.array(features)
# Build the training features: averaged word vectors plus TF-IDF weights.
word2vec_features_train = get_word2vec_features(titles_train, word2vec_model)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(titles_train)
# Concatenate the (densified) TF-IDF matrix with the Word2Vec features.
X_train_combined = np.hstack((X_train.toarray(), word2vec_features_train))

# Train the model.
# NOTE: per the scikit-learn docs `gamma` only applies to the 'rbf', 'poly'
# and 'sigmoid' kernels, so it has no effect with kernel='linear'.
model_combined = SVC(C=5, gamma=0.05, kernel='linear')
model_combined.fit(X_train_combined, labels)

# Predict on the test set, cleaning the titles the same way as for training.
# Missing titles are replaced with '' first (the training frame gets the same
# treatment via fillna); otherwise re.sub raises TypeError on NaN values.
titles_test = test_data['Title'].fillna('')
titles_test = titles_test.apply(lambda x: re.sub(pattern, '', x))
titles_test = titles_test.apply(lambda x: re.sub(r' ', '', x))
titles_test = titles_test.apply(
    lambda x: ' '.join([word for word in jieba.cut(x) if word not in stop_words])
)
word2vec_features_test = get_word2vec_features(titles_test, word2vec_model)
# Reuse the vocabulary fitted on the training titles.
X_test = vectorizer.transform(titles_test)
X_test_combined = np.hstack((X_test.toarray(), word2vec_features_test))
y_pred_test = model_combined.predict(X_test_combined)

# Save the predictions to a file.
result_df = pd.DataFrame({'id': test_data['id'], 'label': y_pred_test})
result_df.to_csv(r'C:\Users\iverson\Desktop\result.csv', index=False)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。