代码拉取完成,页面将自动刷新
import requests
from bs4 import BeautifulSoup
import pymysql
# Scrape the hot-movie listing page and store one row per movie in the
# `movies` table (name, poster URL, score, actors, detail-page link).

# URL of the page to parse
url = 'https://wandou.la/hot/movie'
# Send the GET request; without a timeout a stalled server would block forever
response = requests.get(url, timeout=10)
# Only parse and store when the request succeeded
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    # The five lists below are paired by position further down
    # (presumably one entry per movie in page order — verify against the markup).
    paragraphs = soup.find_all('p', attrs={"class": "name"})  # movie names
    links = soup.find_all('img', attrs={"class": "preview-img"})  # poster images
    scores = soup.find_all('div', attrs={"class": "c-hDImTG"})  # Douban scores
    actors = soup.find_all('p', attrs={"class": "names"})  # lead actors (tags)
    details = soup.find_all('a', attrs={"class": "c-ffeBOn"})  # detail-page anchors

    # Turn each relative href into an absolute detail-page URL and echo it
    detail_links = []
    for detail in details:
        d_link = "https://wandou.la" + detail['href']
        detail_links.append(d_link)
        print(d_link)

    # Cleaned score texts: drop the "豆" marker and surrounding whitespace
    scores_cln = [score.get_text().replace("豆", "").strip() for score in scores]

    # Rows are built by pairing the lists index-by-index, so every list must be
    # at least as long as the name list. Check this before the database is
    # touched instead of failing half-way through the inserts.
    shortest = min(len(links), len(scores_cln), len(actors), len(detail_links))
    if shortest < len(paragraphs):
        raise IndexError(
            f"scraped lists are misaligned: {len(paragraphs)} names, {len(links)} images, "
            f"{len(scores_cln)} scores, {len(actors)} actor tags, {len(detail_links)} detail links"
        )

    # zip() stops at the name list, matching the original enumerate(paragraphs) loop
    rows = [
        (paragraph.get_text(), link["src"], score, actor.get_text(), detail_lk)
        for paragraph, link, score, actor, detail_lk
        in zip(paragraphs, links, scores_cln, actors, detail_links)
    ]

    # Parameterized statement; the driver escapes the values
    sql = "INSERT INTO movies (name, pic, score, actor, detail_link) VALUES (%s, %s, %s, %s, %s)"

    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration or environment variables.
    conn = pymysql.connect(
        host='192.168.18.13',
        port=33046,
        user='root',
        password='123456',
        db='xuweijie',
        charset='utf8',
    )
    try:
        # The cursor is closed when the with-block exits, even on error
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)
        # Commit only after every row has been accepted
        conn.commit()
    finally:
        # Always release the connection, including when an insert fails
        conn.close()
else:
    print("请求失败,状态码:", response.status_code)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。