# [Hosting-page notice, not part of the program] Code fetch complete; the page will refresh automatically.
# -*- coding: utf-8 -*-
"""
@Datetime: 2019/3/28
@Author: Zhang Yafei
"""
# Cosine similarity measure; reference: http://blog.csdn.net/u012160689/article/details/15341303
import functools
import math
import re
import time
from nltk import FreqDist
# Sample inputs for the similarity demo.
# text1 and text2 are near-duplicates that differ in case, punctuation and a
# few extra words; text3 and text4 are longer, unrelated review texts.
text1 = (
    "This game is one of the very best. games ive played. the ;pictures? "
    "cant descripe the real graphics in the game."
)
text2 = (
    "this game have/ is3 one of the very best. games ive played. the ;pictures? "
    "cant descriPe now the real graphics in the game."
)
text3 = "So in the picture i saw a nice size detailed metal puzzle. Eager to try since I enjoy 3d wood puzzles, i ordered it. Well to my disappointment I got in the mail a small square about 4 inches around. And to add more disappointment when I built it it was smaller than the palm of my hand. For the price it should of been much much larger. Don't be fooled. It's only worth $5.00.Update 4/15/2013I have bought and completed 13 of these MODELS from A.C. Moore for $5.99 a piece, so i stand by my comment that thiss one is overpriced. It was still fun to build just like all the others from the maker of this brand.Just be warned, They are small."
text4 = "I love it when an author can bring you into their made up world and make you feel like a friend, confidant, or family. Having a special child of my own I could relate to the teacher and her madcap class. I've also spent time in similar classrooms and enjoyed the uniqueness of each and every child. Her story drew me into their world and had me laughing so hard my family thought I had lost my mind, so I shared the passage so they could laugh with me. Read this book if you enjoy a book with strong women, you won't regret it."
def timeit(func):
    """Decorator that prints the elapsed time of every call to ``func``.

    The wrapped callable's return value is passed through unchanged, and its
    metadata (``__name__``, ``__doc__``) is preserved via ``functools.wraps``.
    If ``func`` raises, the exception propagates and nothing is printed.

    :param func: the callable to time
    :return: a wrapper accepting the same arguments as ``func``
    """
    @functools.wraps(func)
    def wrap(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the wall clock and can jump if the system clock is adjusted.
        start = time.perf_counter()
        res = func(*args, **kwargs)
        print('运行时间为: {0:.4f}' .format(time.perf_counter() - start))
        return res
    return wrap
def preprocess(text):
    """Tokenise ``text`` by splitting it on runs of whitespace.

    This is the hook for text pre-processing; adapt the logic to the data at
    hand. Currently no other cleaning is performed.

    :param text: the string to tokenise
    :return: list of whitespace-separated tokens (empty for blank input)
    """
    tokens = text.split()
    return tokens
# Matches every character that is not an ASCII letter. Compiled once so the
# counting loop does not have to look the pattern up for each word.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def _word_frequencies(words):
    """Return a dict mapping each normalised word to its occurrence count."""
    counts = {}
    for raw in words:
        # Normalise: drop everything except ASCII letters, then lower-case.
        word = _NON_LETTERS.sub('', raw).lower()
        if word:  # tokens with no letters normalise to '' and are skipped
            counts[word] = counts.get(word, 0) + 1
    return counts


@timeit
def compute_cosine(words1, words2):
    """Compute the cosine similarity of two tokenised texts.

    Each token is stripped of non-ASCII-letter characters and lower-cased,
    the remaining words are counted, and the cosine of the angle between the
    two word-frequency vectors is returned.

    :param words1: iterable of string tokens for the first text
    :param words2: iterable of string tokens for the second text
    :return: similarity rounded to two decimals; ``0.0`` when either input
        contains no alphabetic words
    """
    freq1 = _word_frequencies(words1)
    freq2 = _word_frequencies(words2)

    # A word missing from one text has count 0 there and so adds nothing to
    # the dot product; iterating over one side's words is sufficient.
    dot_product = sum(count * freq2.get(word, 0) for word, count in freq1.items())
    sq1 = sum(count * count for count in freq1.values())
    sq2 = sum(count * count for count in freq2.values())

    denominator = math.sqrt(sq1) * math.sqrt(sq2)
    if denominator == 0:
        # At least one text has no countable words: similarity is defined as 0.
        return 0.0
    return round(float(dot_product) / denominator, 2)
def cosin_distance(vector1, vector2):
    """Cosine similarity of two numeric vectors.

    K(X, Y) = <X, Y> / (||X|| * ||Y||)

    Components are paired with ``zip``, so any extra trailing components of
    the longer vector are ignored.

    :param vector1: iterable of numbers
    :param vector2: iterable of numbers
    :return: the cosine as a float, or ``None`` when either vector has zero
        norm over the paired components (this includes empty input)
    """
    inner = 0.0
    sq_len_1 = 0.0
    sq_len_2 = 0.0
    for x, y in zip(vector1, vector2):
        inner += x * y
        sq_len_1 += x ** 2
        sq_len_2 += y ** 2

    # The cosine is undefined for a zero-length vector.
    if sq_len_1 == 0.0 or sq_len_2 == 0.0:
        return None
    return inner / ((sq_len_1 * sq_len_2) ** 0.5)
if __name__ == '__main__':
    # Demo run: tokenise the two sample sentences (rebinding the module-level
    # names to their token lists) and print their cosine similarity.
    text1, text2 = preprocess(text1), preprocess(text2)
    print(compute_cosine(text1, text2))
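# [Hosting-page notice, not part of the program] This location may contain content unsuitable for display, so the page does not show it. You can review and modify it using the relevant editing features.
# If you confirm the content does not involve inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, or content that violates national laws and regulations, you may click submit to appeal and it will be handled as soon as possible.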