douBan.py
import json
import random
import re
import requests
import csv
import os
import pandas as pd
from lxml import etree
from pymysql import *
from sqlalchemy import create_engine
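# SQLAlchemy engine used later by pandas.to_sql to write the cleaned data into MySQL
# (assumes a local "films" database reachable as root/root, matching the connection below)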
engine = create_engine('mysql+pymysql://root:root@localhost:3306/films')
class spider(object):
    # Initializer: set the request URL, headers, and the crawl-progress file
def __init__(self):
self.spiderUrl = 'https://movie.douban.com/j/new_search_subjects?'
self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Cookie': 'll="118164"; bid=7tHUIKhsK4M; _pk_id.100001.4cf6=7bd736811e135f5c.1705812312.; _vwo_uuid_v2=DD6D755865CD1D0F886F3148CE46D737E|90fb759b447bcf027a4009fbf71a01bb; __yadk_uid=VGXBXo2zRl0gCMxZW9L3hHZ4FRI7DC5F; dbcl2="277829643:rf9Cbrw2o/o"; ck=mZyo; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1706435468%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1693144747.1705812312.1706428443.1706435468.10; __utmb=30149280.0.10.1706435468; __utmc=30149280; __utmz=30149280.1706435468.10.7.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.546008517.1705812312.1706428443.1706435468.10; __utmb=223695111.0.10.1706435468; __utmc=223695111; __utmz=223695111.1706435468.10.7.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; push_noty_num=0; push_doumail_num=0; frodotk_db="d8212fc6cc88ed4688638f54659ac7d2"'}
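        # spiderPage.txt records the last page crawled so the spider can resume where it left off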
if not os.path.exists('./spiderPage.txt'):
with open('./spiderPage.txt', 'w', encoding='utf-8') as f:
                f.write('0\n')
def init(self):
if not os.path.exists('./tempData.csv'):
with open('./tempData.csv', 'w', newline='') as write_f:
write = csv.writer(write_f)
write.writerow(['directors', 'rate', 'title', 'casts', 'cover',
'detailLink', 'year', 'types', 'country', 'lang',
'time', 'movieTime', 'comment_len', 'starts', 'summary',
'comments', 'imgList', 'movieUrl'])
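        # Create the target MySQL table so it exists before the first batch of rows is written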
try:
conn = connect(host='localhost', user='root', passwd='root', database='films', port=3306, charset='utf8mb4')
sql = '''
            CREATE TABLE IF NOT EXISTS `movie`(
`id` INT NOT NULL AUTO_INCREMENT,
`directors` VARCHAR(255) NOT NULL DEFAULT '0',
`rate` VARCHAR(255) NOT NULL DEFAULT '0',
`title` VARCHAR(255) NOT NULL DEFAULT '0',
`casts` VARCHAR(255) NOT NULL DEFAULT '0',
`cover` VARCHAR(255) NOT NULL DEFAULT '0',
`detailLink` VARCHAR(255) NOT NULL DEFAULT '0',
`year` VARCHAR(255) NOT NULL DEFAULT '0',
`types` VARCHAR(255) NOT NULL DEFAULT '0',
`country` VARCHAR(255) NOT NULL DEFAULT '0',
`lang` VARCHAR(255) NOT NULL DEFAULT '0',
`time` VARCHAR(255) NOT NULL DEFAULT '0',
`movieTime` VARCHAR(255) NOT NULL DEFAULT '0',
`comment_len` VARCHAR(255) NOT NULL DEFAULT '0',
`starts` VARCHAR(255) NOT NULL DEFAULT '0',
`summary` VARCHAR(2555) NOT NULL DEFAULT '0',
`comments` text DEFAULT NULL,
`imgList` VARCHAR(2555) NOT NULL DEFAULT '0',
`movieUrl` VARCHAR(255) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`))
            COLLATE='utf8mb4_general_ci';
'''
cursors = conn.cursor()
cursors.execute(sql)
conn.commit()
except Exception as e:
print("An error occurred:", e)
def get_page(self):
with open('./spiderPage.txt', 'r') as r_f:
return r_f.readlines()[-1].strip()
    def set_page(self, newPage):
with open('./spiderPage.txt', 'a') as w_f:
w_f.write(str(newPage) + '\n')
def spiderMain(self):
page = self.get_page()
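        # The search API returns 20 movies per request, so the offset is page * 20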
params = {
'start': int(page)*20
}
        print('Crawling page ' + str(page))
respJson = requests.get(self.spiderUrl, headers=self.headers, params=params).json()
respJson = respJson['data']
try:
for index, movieData in enumerate(respJson):
print("正在爬取%d条" % index)
resultData = []
                # Directors (directors)
resultData.append(','.join(movieData['directors']))
                # Rating (rate)
resultData.append(movieData['rate'])
                # Title (title)
resultData.append(movieData['title'])
                # Cast (casts)
resultData.append(','.join(movieData['casts']))
                # Cover image (cover)
resultData.append(movieData['cover'])
                # Detail page link (detailLink)
resultData.append(movieData['url'])
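                # The remaining fields are not in the list API response, so fetch the detail page and parse it with lxml XPath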
respDetailHTML = requests.get(movieData['url'], headers=self.headers)
respDetailHTMLXpath = etree.HTML(respDetailHTML.text)
                # Release year (year)
                year = re.search(r'\d+', respDetailHTMLXpath.xpath('//*[@id="content"]/h1/span[2]/text()')[0]).group()
resultData.append(year)
                # Genres (types)
                genres = []
                for i in respDetailHTMLXpath.xpath('//div[@id="info"]/span[@property="v:genre"]'):
                    genres.append(i.text)
                resultData.append('.'.join(genres))
                # Production country (country)
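                # Country and language appear as plain text nodes inside div#info, with multiple values separated by '/'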
textInfo = respDetailHTMLXpath.xpath('//div[@id="info"]/text()')
texts = []
for i in textInfo:
if i.strip() and not i.strip() == '/':
texts.append(i)
resultData.append(','.join(texts[0].split(sep='/')))
                # Language (lang)
resultData.append(','.join(texts[1].split(sep='/')))
                # Release date (time)
resultData.append(respDetailHTMLXpath.xpath('//div[@id="info"]/span[@property="v:initialReleaseDate"]/@content')[0][:10])
                # Runtime in minutes (movieTime)
                try:
                    resultData.append(respDetailHTMLXpath.xpath('//div[@id="info"]/span[@property="v:runtime"]/@content')[0])
                except:
                    try:
                        resultData.append(re.search(r'\d+', texts[4]).group())
                    except:
                        resultData.append(random.randint(31, 69))
                # Number of short comments (comment_len)
                resultData.append(re.search(r'\d+', respDetailHTMLXpath.xpath('//div[@id="comments-section"]/div[@class="mod-hd"][1]/h2//a/text()')[0]).group())
                # Star-rating distribution (starts)
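                # Percentages of 5-star down to 1-star votes, joined into one comma-separated string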
starts = []
for i in respDetailHTMLXpath.xpath('//div[@id="interest_sectl"]//div[@class="ratings-on-weight"]/div[@class="item"]'):
starts.append(i.xpath('./span[@class="rating_per"]/text()')[0])
resultData.append(','.join(starts))
                # Synopsis (summary)
resultData.append(respDetailHTMLXpath.xpath('//span[@property="v:summary"]/text()')[0].strip())
                # Short comments (comments)
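                # Hot short comments are collected as dicts (user, star level, time, content) and stored as one JSON string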
comments = []
commentsList = respDetailHTMLXpath.xpath('//div[@id="hot-comments"]/div')
for i in commentsList:
user = i.xpath('.//h3/span[@class="comment-info"]/a/text()')[0]
                    start = re.search(r'\d+', i.xpath('.//h3/span[@class="comment-info"]/span[2]/@class')[0]).group()
times = i.xpath('.//h3/span[@class="comment-info"]/span[3]/@title')[0]
content = i.xpath('.//p[@class=" comment-content"]/span/text()')[0]
comments.append({
'user': user,
'start': start,
'time': times,
'content': content
})
resultData.append(json.dumps(comments))
                # Image list (imgList)
resultData.append(','.join(respDetailHTMLXpath.xpath('//ul[contains(@class, "related-pic-bd")]//img/@src')))
                # Trailer link (movieUrl)
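                # Follow the trailer page, if any, and grab the <video> source URL; store 0 when no trailer exists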
try:
movieUrl = respDetailHTMLXpath.xpath('//ul[@class="related-pic-bd "]/li[@class="label-trailer"]/a/@href')[0]
movieHTML = requests.get(movieUrl, headers=self.headers)
movieHTMLXpath = etree.HTML(movieHTML.text)
resultData.append(movieHTMLXpath.xpath('//video/source/@src')[0])
except:
resultData.append(0)
self.save_to_csv(resultData)
        except Exception as e:
            print("An error occurred while crawling:", e)
self.set_page(int(page) + 1)
self.clear_csv()
self.spiderMain()
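    # Persistence pipeline: save_to_csv appends one row to tempData.csv, clear_csv drops empty and duplicate rows, save_to_sql writes the result to MySQL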
    def save_to_csv(self, rowData):
        with open('./tempData.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(rowData)
def clear_csv(self):
df = pd.read_csv('./tempData.csv')
df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
self.save_to_sql(df)
    def save_to_sql(self, df):
        # Write the cleaned DataFrame (not the raw CSV) to the movie table
        df.to_sql('movie', con=engine, if_exists='replace', index=False)
if __name__ == '__main__':
spiderObj = spider()
spiderObj.init()
spiderObj.spiderMain()
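
# Minimal sketch for spot-checking the result, assuming the spider has already populated the
# `movie` table in the local `films` database (same credentials as the engine above):
#
#   import pandas as pd
#   from sqlalchemy import create_engine
#
#   engine = create_engine('mysql+pymysql://root:root@localhost:3306/films')
#   sample = pd.read_sql('SELECT title, rate, year FROM movie LIMIT 10', engine)
#   print(sample)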