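# NOTE: web-page banner captured together with the source; not part of the program:
# 代码拉取完成,页面将自动刷新  (code pull finished; the page will refresh automatically)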
import csv
from re import findall
from time import sleep
import requests
from fake_useragent import UserAgent
import openpyxl
def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    A Chrome User-Agent string from ``fake_useragent`` is sent with the
    request, the URL is printed before the request is made, and the
    function sleeps for two seconds after every request before returning.

    :param url: address to fetch.
    :param timeout: seconds ``requests`` waits for the server before
        giving up; passed straight to ``requests.get``.
    :return: the body decoded as UTF-8 when the status code is 200,
        otherwise ``None``.
    """
    headers = {"User-Agent": UserAgent().chrome}
    print(url)
    # requests applies no timeout unless one is given, so a stalled server
    # would otherwise block the whole crawl.
    resp = requests.get(url, headers=headers, timeout=timeout)
    sleep(2)  # fixed pause after each request
    if resp.status_code != 200:
        return None
    resp.encoding = 'utf-8'
    return resp.text
def pares_index(html):
    """Extract every movie code from a yearly listing response.

    :param html: response text containing ``"movieCode":"..."`` entries,
        or ``None`` when the download failed.
    :return: list of movie-code strings in document order; an empty list
        when *html* is ``None``/empty or contains no codes.
    """
    if not html:
        # get_html() reports a failed download as None, and findall()
        # raises TypeError on None.
        return []
    return findall(r'"movieCode":"(.*?)"', html)  # movie code
def _box_office(label, html):
    """Return the amounts, then their units, of the data cell titled *label*."""
    pattern = (
        r'<div class="data-name">' + label + r'</div><div class="data-value">'
        r'(\d+\.\d+|\d+)<span class="data-unit">(.*?)</span>'
    )
    matches = findall(pattern, html)
    # Same layout as before: all numbers first, followed by all units.
    return [amount for amount, _ in matches] + [unit for _, unit in matches]


def pares_index_detail(list_html_detail):
    """Extract the fields of interest from a movie detail page.

    :param list_html_detail: HTML text of the detail page, or ``None`` when
        the download failed.
    :return: dict of lists of strings with the keys
        ``name`` (movie title), ``wedPrice`` (first-week box office: numbers
        followed by units), ``dayPrice`` (first-day box office, same layout),
        ``score`` (Douban rating), ``direct`` (director names) and
        ``actors`` (names of every listed person whose role is not director).
        Every list is empty when nothing matches or the input is ``None``.
    """
    if not list_html_detail:
        # findall() raises TypeError on None; an empty string makes every
        # pattern below yield [] instead.
        list_html_detail = ""

    name = findall(
        r'<div class="movie-info-content"><div class="movie-name">(.*?)</div>',
        list_html_detail,
    )  # movie title
    wedPrice = _box_office('首周票房', list_html_detail)  # first-week box office
    dayPrice = _box_office('首日票房', list_html_detail)  # first-day box office
    score = findall(
        r'<div class="data-name">豆瓣评分</div><div class="data-value">(.*?)</div>',
        list_html_detail,
    )  # Douban rating
    direct = findall(
        r'<p class="name">(.*?)</p><p class="desc">导演</p>',
        list_html_detail,
    )  # director
    people = findall(
        r'<p class="name">(.*?)</p><p class="desc">(.*?)</p>',
        list_html_detail,
    )  # (name, role) pairs for everyone listed
    actors = format_data(people)
    return {"name": name, "wedPrice": wedPrice, "dayPrice": dayPrice,
            "score": score, "direct": direct, "actors": actors}
def format_data(actors, exclude_role="导演"):
    """Return the names of all cast entries whose role is not *exclude_role*.

    :param actors: iterable of sequences whose item 0 is a person's name
        and item 1 is that person's role label.
    :param exclude_role: role label to drop; defaults to ``"导演"``
        (director), the value that was previously hard-coded.
    :return: list of names in input order.
    """
    return [entry[0] for entry in actors if entry[1] != exclude_role]
def xlsx_data(data, path="filmsdata_1_1.xlsx"):
    """Append one movie's details as a single row of an xlsx workbook.

    Each field of *data* is a list of strings; the items of a field are
    joined with commas into one cell. Cells are written in the order
    name, wedPrice, dayPrice, score, direct, actors. The workbook is
    loaded, extended by one row on its active sheet, and saved back on
    every call.

    :param data: dict holding the six keys above, each mapped to a list
        of strings.
    :param path: workbook file to update; defaults to the previously
        hard-coded ``filmsdata_1_1.xlsx``. A new workbook is created when
        the file does not exist yet.
    :raises KeyError: if *data* lacks one of the expected keys.
    """
    columns = ("name", "wedPrice", "dayPrice", "score", "direct", "actors")
    try:
        workbook = openpyxl.load_workbook(path)
    except FileNotFoundError:
        # First run: start a fresh workbook instead of aborting the crawl.
        workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.append([','.join(data[key]) for key in columns])
    workbook.save(path)
def main(latest_year=2022):
    """Crawl yearly listings and store every movie's details in the workbook.

    Prompts for the number of years to fetch, then walks backwards from
    *latest_year*. For each year it downloads the listing, extracts the
    movie codes, downloads each movie's detail page and appends the parsed
    details through ``xlsx_data``.

    :param latest_year: first (most recent) year to fetch; defaults to the
        previously hard-coded 2022.
    :raises ValueError: if the text entered at the prompt is not an integer.
    """
    num = int(input("请输入要获取多少年:"))
    for offset in range(num):
        year = latest_year - offset
        url = 'https://zgdypf.zgdypw.cn/getYearData?year={}'.format(year)
        list_html = get_html(url)
        if list_html is None:
            # get_html() returns None for any non-200 response.
            print(f'skipping year {year}: listing could not be downloaded')
            continue
        movie = pares_index(list_html)
        print(movie)
        for code in movie:
            url = f'https://zgdypf.zgdypw.cn/movie/detail/{code}?from=movie&tab=1&secondTab={year}'
            list_html_detail = get_html(url)
            if list_html_detail is None:
                print(f'skipping movie {code}: detail page could not be downloaded')
                continue
            detail = pares_index_detail(list_html_detail)
            xlsx_data(detail)
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
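# NOTE: hosting-site moderation notice captured together with the source; not part of the program:
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。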