"""Scrape post titles and timestamps from the Eastmoney Guba board for 000625.

The script downloads the first ``PAGE_COUNT`` list pages of
``https://guba.eastmoney.com/list,000625_<page>.html``, extracts the text of
the title links and of the time cells with XPath, and writes them side by side
into sheet ``s1`` of ``data/1.xls`` (relative to the current working
directory).

Third-party requirements: ``xlwt``, ``requests``, ``lxml``.
"""
import os
from itertools import zip_longest

import requests
import xlwt
from lxml import etree

# Number of list pages to fetch; pages are numbered starting at 1.
PAGE_COUNT = 50
# Seconds to wait for the server before giving up on a page; without a
# timeout a single stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5060.122 Safari/537.36 Edg/103.0.1264.49"}
text_list = []  # collected comment texts
time_list = []  # collected timestamps

for i in range(1, PAGE_COUNT + 1):  # pages 1..PAGE_COUNT inclusive
    url = 'https://guba.eastmoney.com/list,000625_{}.html'.format(i)  # substitute the page number
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    maintree = etree.HTML(response.text)
    if maintree is None:
        # lxml returns None when the body cannot be parsed into a tree
        # (e.g. an empty response); there is nothing to extract from it.
        continue
    # All comment texts on this page.
    list1 = maintree.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr/td[3]/div/a//text()')
    # All timestamps on this page.
    list2 = maintree.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr/td[5]/div//text()')
    text_list.extend(list1)
    time_list.extend(list2)

w = xlwt.Workbook()  # build the Excel workbook
s = w.add_sheet('s1')
s.write(0, 0, '评论')
s.write(0, 1, '时间')
# zip_longest keeps every scraped value even if the two lists ended up with
# different lengths; the missing side of a row is written as an empty cell
# instead of raising IndexError and losing the whole run.
for row, (text, timestamp) in enumerate(
        zip_longest(text_list, time_list, fillvalue=''), start=1):
    s.write(row, 0, text)
    s.write(row, 1, timestamp)
os.makedirs('data', exist_ok=True)  # the save below fails if the directory is missing
w.save('data/1.xls')  # store the workbook on disk