加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
reptile_maoyan_fail.py 4.00 KB
一键复制 编辑 原始数据 按行查看 历史
rookie 提交于 2024-02-27 03:07 . 本次的提交信息
import hashlib
import random
import time
import uuid
from fake_useragent import UserAgent
import requests
from re import findall
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
def get_html(url):
    """Download one Maoyan listing page and return its HTML.

    :param url: address of the listing page to crawl
    :return: the response body decoded as UTF-8 text, or None when the
        server answers with a non-200 status code
    """
    # Cookie copied from a real browser session so Maoyan does not
    # redirect the request to its captcha / verification page.
    session_cookie = (
        "__mta=207860269.1704201302145.1704213737093.1704213880564.33;"
        "uuid_n_v=v1;"
        "uuid=EAC4EE70A97011EE9D005D48E25AAA2026CA184696BC4B3C99668D0A2CABF221;"
        "_lxsdk_cuid=18cca502bf1c8-0ab73a0cab1c77-26001951-144000-18cca502bf1c8;"
        "_csrf=393580308f4d1dbe8c7c22df65a716f6d3b2e6ebf5d081d15383e0c5f4abf77d;"
        "Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1704201301,1704207964,1704208473;"
        "_lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic;"
        "_lxsdk=EAC4EE70A97011EE9D005D48E25AAA2026CA184696BC4B3C99668D0A2CABF221;"
        "Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1704213881;"
        "_lxsdk_s=18cca9eeac1-26b-1be-ff4%7C%7C81"
    )
    request_headers = {
        "User-Agent": UserAgent().chrome,
        "Cookie": session_cookie,
    }
    print(url)
    response = requests.get(url, headers=request_headers, timeout=5)
    sleep(5)  # throttle between requests to avoid being rate-limited
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    return response.text
def get_html_more(url):
    """Open a film detail page in a real browser and return its HTML.

    Selenium is used because the detail pages are protected by an
    anti-scraping signature check that plain HTTP requests trip over.

    :param url: detail-page address (e.g. http://www.maoyan.com/films/1234)
    :return: the rendered page source of *url*
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # --- signed-URL experiment (currently unused) --------------------
        # Maoyan's ajax endpoints expect an MD5 signKey derived from the
        # request parameters.  The computation is kept for reference; the
        # signed request itself is still disabled below.
        uid = uuid.uuid4()  # random uid (not yet used in the signature)
        ts = time.time() * 1000  # millisecond timestamp
        key = 'A013F70DB97834C0A5492378BD76C53A'  # fixed salt
        ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'  # fixed UA
        index = 1000 * random.random() + 1
        enstr = f'method=GET&timeStamp={ts}&User-Agent={ua}&index={index}&channelId=40009&sVersion=2&key={key}'
        signkey = hashlib.md5(enstr.encode(encoding='UTF-8')).hexdigest()
        # https://www.maoyan.com/ajax/films/1320283?timeStamp={}&index={}&signKey={}&channelId=40011&sVersion=1&webdriver=false
        # fix: the original string read '?timeStamp{ts}' — the missing '='
        # produced a malformed query parameter.
        url_1 = url + f'?timeStamp={ts}&index={index}&signKey={signkey}&channelId=40011&sVersion=1&webdriver=false'
        # NOTE(review): WebDriverWait without .until(condition) does not
        # actually wait for anything — kept for parity with the original,
        # but an expected_conditions call is needed for a real wait.
        WebDriverWait(driver, 10)
        # driver.get(url_1)
        return driver.page_source
    finally:
        # fix: the Chrome driver was previously never closed, leaking one
        # browser process per detail page crawled.
        driver.quit()
def parse_list(html):
    """Extract every film-detail link from a listing page.

    :param html: HTML of a page that contains the movie list
    :return: list of absolute detail-page URLs
    """
    # Each list entry looks like:
    #   <div class="channel-detail movie-item-title" ...>
    #       <a href="/films/NNNN">...
    detail_paths = findall(
        r'<div class="channel-detail movie-item-title".+>\s+<a href="(/films/\d+)"',
        html,
    )
    return ['http://www.maoyan.com{}'.format(path) for path in detail_paths]
def pares_index(html):
    """Extract film details from a detail-page HTML.

    :param html: HTML of a single film's detail page
    :return: dict with keys "name", "type", "country", "filmtime", "actors"
    :raises IndexError: if the page layout does not contain enough
        "ellipsis" entries for the hard-coded indexes below
    """
    name = findall(r'<h1 class="name">(.+)</h1>', html)  # film title match list
    # fix: renamed from `type`, which shadowed the builtin of the same name
    film_type = findall(r'<a class="text-link".*?>(.*?)</a>', html)  # genre links
    # NOTE(review): the [1][1] / [2][1] indexes rely on the exact order of
    # "ellipsis" items on the page — fragile, confirm against a live page.
    country = findall(r'class="ellipsis">(\s*)(\S+)', html)[1][1]  # release country
    filmtime = findall(r'<li class="ellipsis">(\s*)(\S+)(\s*)', html)[2][1]  # release date
    actors_raw = findall(r'<div class="name">\s*(.*?)\s*</div>', html)
    actors = format_data(actors_raw)  # stripped, de-duplicated actor names
    # dirctor =
    return {"name": name, "type": film_type, "country": country,
            "filmtime": filmtime, "actors": actors}
def format_data(actors):
    """Strip surrounding whitespace from each actor name and drop duplicates.

    :param actors: iterable of raw actor-name strings
    :return: set of cleaned, unique names
    """
    # idiom: a set comprehension replaces the manual loop + .add()
    return {actor.strip() for actor in actors}
def main():
    """Crawl the requested number of listing pages and print each film's details."""
    num = int(input("请输入要获取多少页:"))
    for page in range(num):
        # each listing page shows 30 films, paged via the offset parameter
        url = 'https://maoyan.com/films?showType=3&offset={}'.format(page * 30)
        list_html = get_html(url)
        if list_html is None:
            # fix: get_html returns None on a non-200 response; previously
            # that None crashed parse_list/findall with a TypeError.
            print('获取列表页失败: {}'.format(url))
            continue
        for detail_url in parse_list(list_html):
            info_html = get_html_more(detail_url)
            movie = pares_index(info_html)
            print(movie)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化