import re
import time

import requests
from bs4 import BeautifulSoup

import mysqloperate
import unity
# Crawl requirements: cover image, reader count, title;
# chapter titles;
# chapter content for the first 10 chapters of each novel (the free chapters);
# then insert everything into the database.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'book.zongheng.com',
    'Cookie': 'ZHID=5494A2374863F528FD22DA1FC9F8D8F2; ver=2018; zh_visitTime=1638971051871; v_user=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D_Fvcvg2ef-VMuqMux0xRAZ8FaAqeeMjHA22m5f1YkGd5QRnuo8mkwcYaJozN5D0l%26wd%3D%26eqid%3Da57b99dc0015c4f70000000461b0b6a7%7Chttp%3A%2F%2Fwww.zongheng.com%2F%7C97436826; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2217d9a498fd8703-04893b6ee9e488-978183a-2073600-17d9a498fd9141%22%2C%22%24device_id%22%3A%2217d9a498fd8703-04893b6ee9e488-978183a-2073600-17d9a498fd9141%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; Hm_up_c202865d524849216eea846069349eb9=%7B%22uid_%22%3A%7B%22value%22%3A%225494A2374863F528FD22DA1FC9F8D8F2%22%2C%22scope%22%3A1%7D%7D; PassportCaptchaId=e66945a09c0c2a8882c2956096bba347; rSet=1_3_1_14; zhffr=book.zongheng.com; Hm_lvt_c202865d524849216eea846069349eb9=1638971052,1639004935; Hm_lpvt_c202865d524849216eea846069349eb9=1639004935; JSESSIONID=abcrF8b9LorlE5igQtC2x',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
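# Note: the Cookie above is a captured browser session and will expire; whether
# the site still serves these pages without it is site-dependent, so treat the
# value as a stand-in to be refreshed from a real browser session.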
db = mysqloperate.mysql()   # MySQL wrapper from the local mysqloperate module
tool = unity.unity()        # local helpers: getNavId(), strToNum()
# Chapter-heading patterns, used by the (currently disabled) filter in gettitles.
patten1 = re.compile("(第)?[一二三四五六七八九十]章+")
patten2 = re.compile("(第)?[0-9]*?[章]+")
patten3 = re.compile("[三四五六七八九十]")
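# Illustration only (example titles made up, not executed by the crawler):
#   patten1.match("第一章 xxx")  -> matches "第一章"
#   patten2.match("第12章 xxx")  -> matches "第12章"
#   patten3.match("三十章 xxx")  -> matches "三"  (note: 一 and 二 are not in the class)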
def getbookids(url):
    # Collect the numeric book ids from one store listing page.
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    books = soup.find_all("div", class_='bookbox fl')
    ids = []
    for book in books:
        name = book.find("div", class_='bookname')
        aa = name.select("a")
        # href looks like http://book.zongheng.com/book/<id>.html
        href = aa[0].attrs["href"]
        temp = href.split("/")[-1]
        id = temp.split(".")[0]
        ids.append(id)
    return ids
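# Usage sketch (network access required; selectors match the page layout as of
# the crawl date):
#   ids = getbookids(geturls()[0])
#   print(ids[:5])   # ids are strings, e.g. '1048326'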
def getbookinfo(bookid):
    # Scrape one book's detail page into a dict matching the novels table.
    url = f'http://book.zongheng.com/book/{bookid}.html'
    print(url)
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    book = {}
    book['novel_id'] = bookid
    info = soup.find("div", class_='book-top clearfix')
    name = info.select_one("div[class='book-name']")
    book['name'] = name.text.strip()
    author = info.select_one("div[class='au-name']")
    book['author'] = author.text.strip()
    desc = info.select_one("div[class='book-dec Jbook-dec hide']")
    book['desc'] = desc.text.strip()
    img = info.select_one("img")
    book['img'] = img.attrs['src']
    state = info.select_one("a[class='state']")
    # 1 = still serialising ("连载中"), 2 = finished
    if state.text.strip() == '连载中':
        book['is_over'] = 1
    else:
        book['is_over'] = 2
    label = info.select_one("a[class='label']")
    book['nav_id'] = tool.getNavId(label.text.strip())
    book['wordcount'] = 0
    book['hits'] = 0
    info = soup.find("div", class_='nums')
    spans = info.select("i")
    if len(spans) > 0:
        book['wordcount'] = tool.strToNum(spans[0].text.strip())
    if len(spans) > 2:
        book['hits'] = tool.strToNum(spans[2].text.strip())
    book['is_tuijian'] = 1
    book['is_hot'] = 1
    return book
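# Shape of the dict returned above (keys match the columns interpolated in
# novelsToDb below): novel_id, name, author, desc, img, is_over (1 = ongoing,
# 2 = finished), nav_id, wordcount, hits, is_tuijian, is_hot.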
def getnovels(urls):
    # Collect book ids from the listing pages (capped at roughly 600), then
    # fetch each book's detail page.
    bookids = []
    for url in urls:
        ids = getbookids(url)
        bookids += ids
        if len(bookids) >= 600:
            break
    novels = []
    for bookid in bookids:
        novels.append(getbookinfo(bookid))
        time.sleep(1)  # throttle requests
    return novels
def gettitles(bookid):
    # Scrape the chapter list page; returns chapter URLs, chapter labels
    # (e.g. "第一章") and chapter titles as three parallel lists.
    url = f'http://book.zongheng.com/showchapter/{bookid}.html'
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    uls = soup.find_all("ul", class_='chapter-list clearfix')
    chapters = {'urls': [], 'chapter': [], 'titles': []}
    # Some books use the first <ul> for announcements; prefer the second one.
    ul = uls[0]
    if len(uls) > 1:
        ul = uls[1]
    for li in ul.select("li"):
        title = li.get_text().strip()
        href = li.select('a')[0]['href']
        print(title)
        # Strict heading filter (patten1..3), currently disabled:
        # if patten1.match(title) is None and patten2.match(title) is None \
        #         and patten3.match(title) is None:
        #     continue
        title = title.replace(':', '')
        title = title.replace(':', '')
        title = title.replace('、', '')
        title = ' '.join(title.split())
        # Titles are usually "第一章 xxx"; some books use "1.xxx" instead.
        sep = " "
        if title.count(".") == 1:
            sep = "."
        tt = title.split(sep)
        if len(tt) < 2:
            continue
        print(bookid, title, tt)
        chapters['urls'].append(href)
        chapters['chapter'].append(tt[0])
        chapters['titles'].append(' '.join(tt[1:]))
    return chapters
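# Hedged sketch (hypothetical helper, not called anywhere): the chapter/title
# split performed inside gettitles, isolated so it can be tested on its own.
def _split_chapter_title(title):
    title = title.replace(':', '').replace(':', '').replace('、', '')
    title = ' '.join(title.split())
    sep = '.' if title.count('.') == 1 else ' '
    parts = title.split(sep)
    if len(parts) < 2:
        return None
    return parts[0], ' '.join(parts[1:])
# _split_chapter_title('第一章 xxx') -> ('第一章', 'xxx')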
def getcontent(url):
    # The chapter body lives in <div class="content" itemprop="acticleBody">;
    # "acticleBody" is the site's own typo, so it is matched verbatim here.
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    content = soup.find("div", class_='content', itemprop="acticleBody")
    ss = str(content)
    ss = ss.replace("\n", '')
    # Strip the wrapping div, keeping the inner HTML.
    ss = ss.replace('<div class="content" itemprop="acticleBody">', '')
    content = ss.replace('</div>', '')
    return content
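# Usage sketch: the chapter URL comes from gettitles(bookid)['urls']; the
# return value is the chapter's inner HTML (typically a run of <p> elements):
#   html = getcontent(chapter_url)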
def getOneBook(bookid):
    # Collect one book's chapters; skip books with fewer than 10 usable titles.
    infos = gettitles(bookid)
    if len(infos['titles']) < 10:
        print(bookid, len(infos['titles']), infos['titles'])
        return None
    dicts = []
    chapternum = 0
    for url in infos['urls']:
        content = ''
        # Only the first 10 chapters get their full text downloaded.
        if chapternum < 10:
            content = getcontent(url)
            content = content.replace('"', '\\"')  # escape for the raw SQL below
        chapter = infos['chapter'][chapternum]
        title = infos['titles'][chapternum]
        chapternum += 1
        article = {}
        article['novel_id'] = bookid
        article['chapter'] = chapter
        article['title'] = title
        article['chapternum'] = chapternum
        article['content'] = content
        dicts.append(article)
    return dicts
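# Each dict appended above becomes one row of the articles table: novel_id,
# chapter label, title, 1-based chapternum, and content (empty for chapters
# after the tenth, which only get their titles stored).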
def geturls():
    # Build the URLs of store listing pages 1..29.
    urls = []
    for i in range(1, 30):
        url = f'http://www.zongheng.com/store/c0/c0/b0/u0/p{i}/v9/s1/t0/u0/i1/ALL.html'
        urls.append(url)
    return urls
def novelsToDb(novels):
    for novel in novels:
        t1 = time.localtime()
        t2 = time.strftime("%Y-%m-%d %H:%M:%S", t1)
        # NOTE: field values are interpolated directly into the SQL string, so
        # an unescaped quote in scraped text will break (or inject into) it.
        sql = f'insert into novels values({novel["novel_id"]},{novel["nav_id"]},"{novel["img"]}",\
            "{novel["name"]}","{novel["author"]}",{novel["wordcount"]},{novel["hits"]},\
            {novel["is_over"]},"{novel["desc"]}",{novel["is_tuijian"]},\
            {novel["is_hot"]},0,0,"{t2}","{t2}")'
        db.insert_db(sql)
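# Hedged alternative (connection details hypothetical; the mysqloperate wrapper
# API is not shown here): with direct pymysql access, a parameterized insert
# avoids the quoting problems noted above:
#   import pymysql
#   conn = pymysql.connect(host='localhost', user='root',
#                          password='***', db='noveldb')
#   with conn.cursor() as cur:
#       cur.execute('insert into novels values '
#                   '(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,0,0,%s,%s)',
#                   (novel['novel_id'], novel['nav_id'], novel['img'],
#                    novel['name'], novel['author'], novel['wordcount'],
#                    novel['hits'], novel['is_over'], novel['desc'],
#                    novel['is_tuijian'], novel['is_hot'], t2, t2))
#   conn.commit()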
def articleToDb(articles):
    if articles is None:
        return
    maxid = getMaxArticle()
    for article in articles:
        t1 = time.localtime()
        t2 = time.strftime("%Y-%m-%d %H:%M:%S", t1)
        maxid += 1
        sql = f'insert into articles values({maxid},{article["novel_id"]},"{article["chapter"]}",{article["chapternum"]},\
            "{article["title"]}","{article["content"]}",1,"{t2}","{t2}")'
        db.insert_db(sql)
def getnovelids_fromdb():
    # Returns a list of (id,) row tuples, one per novel already in the table.
    sql = 'select id from novels'
    return db.query_db(sql, True)
def delnovel(novelid):
    sql = f'delete from novels where id={novelid}'
    return db.insert_db(sql)
def getMaxArticle():
    # Current max article id, or 0 if the table is empty.
    sql = 'select max(id) from articles'
    ret = db.query_db(sql, False)
    maxid = ret[0]
    if maxid is None:
        maxid = 0
    return maxid
def getarticles(bookids):
    num = 0
    begin = 0   # set to a novel id to resume a crawl from that id
    for bookid in bookids:
        id = bookid[0]
        # Skip rows until the resume id is reached.
        if begin != 0 and begin != id:
            continue
        # Once reached, discard the resume id so the remaining ids are processed.
        begin = 0
        articles = getOneBook(id)
        if articles is None:
            # Too few parseable chapters: drop the novel from the table.
            delnovel(id)
            print(f'{id} is deleted')
            continue
        print('#####################, num:', num)
        articleToDb(articles)
        num += 1
        if num == 500:
            break
if __name__ == '__main__':
    # Step 1 (run once): build the novels table.
    # urls = geturls()
    # novels = getnovels(urls)
    # novelsToDb(novels)
    # Step 2: read the novel ids back from the novels table...
    bookids = getnovelids_fromdb()
    # ...and crawl their chapter contents.
    getarticles(bookids)
    # Repair mode: re-crawl specific books by id.
    # bookids = [1048326]
    # for id in bookids:
    #     articles = getOneBook(id)
    #     if articles is not None:
    #         articleToDb(articles)
    #     else:
    #         print(articles)