import re
import time

import requests
from bs4 import BeautifulSoup

import mysqloperate
import unity
# Crawl requirements: cover image, reader count, title;
# chapter titles;
# chapter content for the first 10 chapters of each novel (the free chapters);
# then insert everything into the database.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'book.zongheng.com',
    'Cookie': 'ZHID=5494A2374863F528FD22DA1FC9F8D8F2; ver=2018; zh_visitTime=1638971051871; v_user=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D_Fvcvg2ef-VMuqMux0xRAZ8FaAqeeMjHA22m5f1YkGd5QRnuo8mkwcYaJozN5D0l%26wd%3D%26eqid%3Da57b99dc0015c4f70000000461b0b6a7%7Chttp%3A%2F%2Fwww.zongheng.com%2F%7C97436826; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2217d9a498fd8703-04893b6ee9e488-978183a-2073600-17d9a498fd9141%22%2C%22%24device_id%22%3A%2217d9a498fd8703-04893b6ee9e488-978183a-2073600-17d9a498fd9141%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; Hm_up_c202865d524849216eea846069349eb9=%7B%22uid_%22%3A%7B%22value%22%3A%225494A2374863F528FD22DA1FC9F8D8F2%22%2C%22scope%22%3A1%7D%7D; PassportCaptchaId=e66945a09c0c2a8882c2956096bba347; rSet=1_3_1_14; zhffr=book.zongheng.com; Hm_lvt_c202865d524849216eea846069349eb9=1638971052,1639004935; Hm_lpvt_c202865d524849216eea846069349eb9=1639004935; JSESSIONID=abcrF8b9LorlE5igQtC2x',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
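# Note: the Cookie above is a captured browser session and will expire; whether
# the site still serves these pages without it is site-dependent, so treat the
# value as a stand-in to be refreshed from a real browser session.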
db = mysqloperate.mysql()   # MySQL wrapper from the local mysqloperate module
tool = unity.unity()        # local helpers: getNavId(), strToNum()
# Chapter-heading patterns, used by the (currently disabled) filter in gettitles.
patten1 = re.compile("(第)?[一二三四五六七八九十]章+")
patten2 = re.compile("(第)?[0-9]*?[章]+")
patten3 = re.compile("[三四五六七八九十]")
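# Illustration only (example titles made up, not executed by the crawler):
#   patten1.match("第一章 xxx")  -> matches "第一章"
#   patten2.match("第12章 xxx")  -> matches "第12章"
#   patten3.match("三十章 xxx")  -> matches "三"  (note: 一 and 二 are not in the class)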
def getbookids(url):
    # Collect the numeric book ids from one store listing page.
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    books = soup.find_all("div", class_='bookbox fl')
    ids = []
    for book in books:
        name = book.find("div", class_='bookname')
        aa = name.select("a")
        # href looks like http://book.zongheng.com/book/<id>.html
        href = aa[0].attrs["href"]
        temp = href.split("/")[-1]
        id = temp.split(".")[0]
        ids.append(id)
    return ids
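# Usage sketch (network access required; selectors match the page layout as of
# the crawl date):
#   ids = getbookids(geturls()[0])
#   print(ids[:5])   # ids are strings, e.g. '1048326'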
def getbookinfo(bookid):
    # Scrape one book's detail page into a dict matching the novels table.
    url = f'http://book.zongheng.com/book/{bookid}.html'
    print(url)
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    book = {}
    book['novel_id'] = bookid
    info = soup.find("div", class_='book-top clearfix')
    name = info.select_one("div[class='book-name']")
    book['name'] = name.text.strip()
    author = info.select_one("div[class='au-name']")
    book['author'] = author.text.strip()
    desc = info.select_one("div[class='book-dec Jbook-dec hide']")
    book['desc'] = desc.text.strip()
    img = info.select_one("img")
    book['img'] = img.attrs['src']
    state = info.select_one("a[class='state']")
    # 1 = still serialising ("连载中"), 2 = finished
    if state.text.strip() == '连载中':
        book['is_over'] = 1
    else:
        book['is_over'] = 2
    label = info.select_one("a[class='label']")
    book['nav_id'] = tool.getNavId(label.text.strip())
    book['wordcount'] = 0
    book['hits'] = 0
    info = soup.find("div", class_='nums')
    spans = info.select("i")
    if len(spans) > 0:
        book['wordcount'] = tool.strToNum(spans[0].text.strip())
    if len(spans) > 2:
        book['hits'] = tool.strToNum(spans[2].text.strip())
    book['is_tuijian'] = 1
    book['is_hot'] = 1
    return book
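# Shape of the dict returned above (keys match the columns interpolated in
# novelsToDb below): novel_id, name, author, desc, img, is_over (1 = ongoing,
# 2 = finished), nav_id, wordcount, hits, is_tuijian, is_hot.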
def getnovels(urls):
    # Collect book ids from the listing pages (capped at roughly 600), then
    # fetch each book's detail page.
    bookids = []
    for url in urls:
        ids = getbookids(url)
        bookids += ids
        if len(bookids) >= 600:
            break
    novels = []
    for bookid in bookids:
        novels.append(getbookinfo(bookid))
        time.sleep(1)  # throttle requests
    return novels
def gettitles(bookid):
    # Scrape the chapter list page; returns chapter URLs, chapter labels
    # (e.g. "第一章") and chapter titles as three parallel lists.
    url = f'http://book.zongheng.com/showchapter/{bookid}.html'
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    uls = soup.find_all("ul", class_='chapter-list clearfix')
    chapters = {'urls': [], 'chapter': [], 'titles': []}
    # Some books use the first <ul> for announcements; prefer the second one.
    ul = uls[0]
    if len(uls) > 1:
        ul = uls[1]
    for li in ul.select("li"):
        title = li.get_text().strip()
        href = li.select('a')[0]['href']
        print(title)
        # Strict heading filter (patten1..3), currently disabled:
        # if patten1.match(title) is None and patten2.match(title) is None \
        #         and patten3.match(title) is None:
        #     continue
        title = title.replace(':', '')
        title = title.replace(':', '')
        title = title.replace('、', '')
        title = ' '.join(title.split())
        # Titles are usually "第一章 xxx"; some books use "1.xxx" instead.
        sep = " "
        if title.count(".") == 1:
            sep = "."
        tt = title.split(sep)
        if len(tt) < 2:
            continue
        print(bookid, title, tt)
        chapters['urls'].append(href)
        chapters['chapter'].append(tt[0])
        chapters['titles'].append(' '.join(tt[1:]))
    return chapters
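# Hedged sketch (hypothetical helper, not called anywhere): the chapter/title
# split performed inside gettitles, isolated so it can be tested on its own.
def _split_chapter_title(title):
    title = title.replace(':', '').replace(':', '').replace('、', '')
    title = ' '.join(title.split())
    sep = '.' if title.count('.') == 1 else ' '
    parts = title.split(sep)
    if len(parts) < 2:
        return None
    return parts[0], ' '.join(parts[1:])
# _split_chapter_title('第一章 xxx') -> ('第一章', 'xxx')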
def getcontent(url):
    # The chapter body lives in <div class="content" itemprop="acticleBody">;
    # "acticleBody" is the site's own typo, so it is matched verbatim here.
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.text, 'html.parser')
    content = soup.find("div", class_='content', itemprop="acticleBody")
    ss = str(content)
    ss = ss.replace("\n", '')
    # Strip the wrapping div, keeping the inner HTML.
    ss = ss.replace('<div class="content" itemprop="acticleBody">', '')
    content = ss.replace('</div>', '')
    return content
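# Usage sketch: the chapter URL comes from gettitles(bookid)['urls']; the
# return value is the chapter's inner HTML (typically a run of <p> elements):
#   html = getcontent(chapter_url)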
def getOneBook(bookid):
    # Collect one book's chapters; skip books with fewer than 10 usable titles.
    infos = gettitles(bookid)
    if len(infos['titles']) < 10:
        print(bookid, len(infos['titles']), infos['titles'])
        return None
    dicts = []
    chapternum = 0
    for url in infos['urls']:
        content = ''
        # Only the first 10 chapters get their full text downloaded.
        if chapternum < 10:
            content = getcontent(url)
            content = content.replace('"', '\\"')  # escape for the raw SQL below
        chapter = infos['chapter'][chapternum]
        title = infos['titles'][chapternum]
        chapternum += 1
        article = {}
        article['novel_id'] = bookid
        article['chapter'] = chapter
        article['title'] = title
        article['chapternum'] = chapternum
        article['content'] = content
        dicts.append(article)
    return dicts
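# Each dict appended above becomes one row of the articles table: novel_id,
# chapter label, title, 1-based chapternum, and content (empty for chapters
# after the tenth, which only get their titles stored).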
def geturls():
    # Build the URLs of store listing pages 1..29.
    urls = []
    for i in range(1, 30):
        url = f'http://www.zongheng.com/store/c0/c0/b0/u0/p{i}/v9/s1/t0/u0/i1/ALL.html'
        urls.append(url)
    return urls
def novelsToDb(novels):
    for novel in novels:
        t1 = time.localtime()
        t2 = time.strftime("%Y-%m-%d %H:%M:%S", t1)
        # NOTE: field values are interpolated directly into the SQL string, so
        # an unescaped quote in scraped text will break (or inject into) it.
        sql = f'insert into novels values({novel["novel_id"]},{novel["nav_id"]},"{novel["img"]}",\
            "{novel["name"]}","{novel["author"]}",{novel["wordcount"]},{novel["hits"]},\
            {novel["is_over"]},"{novel["desc"]}",{novel["is_tuijian"]},\
            {novel["is_hot"]},0,0,"{t2}","{t2}")'
        db.insert_db(sql)
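# Hedged alternative (connection details hypothetical; the mysqloperate wrapper
# API is not shown here): with direct pymysql access, a parameterized insert
# avoids the quoting problems noted above:
#   import pymysql
#   conn = pymysql.connect(host='localhost', user='root',
#                          password='***', db='noveldb')
#   with conn.cursor() as cur:
#       cur.execute('insert into novels values '
#                   '(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,0,0,%s,%s)',
#                   (novel['novel_id'], novel['nav_id'], novel['img'],
#                    novel['name'], novel['author'], novel['wordcount'],
#                    novel['hits'], novel['is_over'], novel['desc'],
#                    novel['is_tuijian'], novel['is_hot'], t2, t2))
#   conn.commit()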
def articleToDb(articles):
    if articles is None:
        return
    maxid = getMaxArticle()
    for article in articles:
        t1 = time.localtime()
        t2 = time.strftime("%Y-%m-%d %H:%M:%S", t1)
        maxid += 1
        sql = f'insert into articles values({maxid},{article["novel_id"]},"{article["chapter"]}",{article["chapternum"]},\
            "{article["title"]}","{article["content"]}",1,"{t2}","{t2}")'
        db.insert_db(sql)
def getnovelids_fromdb():
    # Returns a list of (id,) row tuples, one per novel already in the table.
    sql = 'select id from novels'
    return db.query_db(sql, True)
def delnovel(novelid):
    sql = f'delete from novels where id={novelid}'
    return db.insert_db(sql)
def getMaxArticle():
    # Current max article id, or 0 if the table is empty.
    sql = 'select max(id) from articles'
    ret = db.query_db(sql, False)
    maxid = ret[0]
    if maxid is None:
        maxid = 0
    return maxid
def getarticles(bookids):
    num = 0
    begin = 0   # set to a novel id to resume a crawl from that id
    for bookid in bookids:
        id = bookid[0]
        # Skip rows until the resume id is reached.
        if begin != 0 and begin != id:
            continue
        # Once reached, discard the resume id so the remaining ids are processed.
        begin = 0
        articles = getOneBook(id)
        if articles is None:
            # Too few parseable chapters: drop the novel from the table.
            delnovel(id)
            print(f'{id} is deleted')
            continue
        print('#####################, num:', num)
        articleToDb(articles)
        num += 1
        if num == 500:
            break
if __name__ == '__main__':
    # Step 1 (run once): build the novels table.
    # urls = geturls()
    # novels = getnovels(urls)
    # novelsToDb(novels)
    # Step 2: read the novel ids back from the novels table...
    bookids = getnovelids_fromdb()
    # ...and crawl their chapter contents.
    getarticles(bookids)
    # Repair mode: re-crawl specific books by id.
    # bookids = [1048326]
    # for id in bookids:
    #     articles = getOneBook(id)
    #     if articles is not None:
    #         articleToDb(articles)
    #     else:
    #         print(articles)