main

分支 (2)

管理

管理

main

master

python-project
/
novel_qidian.py

#!/usr/bin/python3
# -*- coding: UTF-8 -*-

# Python3 标准库
# https://docs.python.org/zh-cn/3/library/index.html

# 解析HTML
#1: 正则表达式大法
#2: requests-html pip install requests-html
#3: BeautifulSoup pip install beautifulsoup4
#4: lxml.XPath    pip install lxml
#[](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9pbWcxLnR1aWNvb2wuY29tL25peUlSYkouanBnIXdlYg?x-oss-process=image/format,png)
#5: SGMLParser
#6: HTMLParser

# version: 3.11.4
# 执行
# 创建空白__init__.py


import requests
import os
import sys
import time
import datetime
import html
import random
import math
import re
import json
import urllib
import shutil
import glob
import zlib
import doctest
import hashlib
from pathlib import Path
from io import StringIO
from bs4 import BeautifulSoup

def getDocumentFromUrl(url):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup document.

    The page is requested with a desktop Firefox User-Agent, its body is
    decoded as UTF-8, HTML entities are unescaped, and the result is parsed
    with the ``html.parser`` backend.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the response body.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` for HTTP error statuses).
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    request = urllib.request.Request(url)
    # request.add_header('Host','www.biququ.la')
    # request.add_header('Referer','https://www.biququ.la/html/27744/')
    request.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0')

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        if response.status != 200:
            # print() does not interpolate %-placeholders by itself; format
            # the message explicitly. Parsing still proceeds, as before.
            print("访问%s失败，返回码为:%s" % (url, response.status))
        rawBody = response.read()

    # Decode as UTF-8 to obtain the Chinese text, then resolve HTML entities.
    htmlString = html.unescape(rawBody.decode('utf-8'))
    return BeautifulSoup(htmlString, 'html.parser')

def parseBiququ(document):
    """Build the plain-text form of one chapter from a parsed chapter page.

    The title is the text of the first ``div.bookname h1`` match under
    ``document.body``; the body is the text of every ``div#content p`` match.
    The result is the title, then each paragraph, each followed by a newline,
    with one extra newline at the end. The text is also printed to stdout.

    Args:
        document: Parsed page exposing ``body.select(css_selector)``.

    Returns:
        The assembled chapter text.

    Raises:
        IndexError: If no ``div.bookname h1`` element is found.
    """
    pageBody = document.body
    heading = pageBody.select("div.bookname h1")[0].text
    paragraphs = pageBody.select("div#content p")

    # Title line, one line per paragraph, then a blank separator line.
    bodyText = "".join(paragraph.text + "\n" for paragraph in paragraphs)
    chapter = heading + "\n" + bodyText + "\n"

    print(chapter)
    return chapter

def write2File(chapter, file):
    """Append ``chapter`` to the text file at ``file`` using UTF-8.

    The file is created if it does not exist; existing content is kept and
    the new text is added at the end.

    Args:
        chapter: Text to append.
        file: Path of the output file.

    Returns:
        None.
    """
    # ``with`` guarantees the handle is flushed and closed; the previous
    # ``fo.close`` (without parentheses) never actually called close().
    with open(file, mode="a", encoding="utf8") as fo:
        fo.write(chapter)

def webcrawler():
    """Walk the chapter-number range and report progress on stdout.

    Prints every integer from 587 through 680, one per line, followed by
    ``finish``. The fetch/parse/append pipeline is currently disabled (kept
    below as comments), so nothing is downloaded or written.
    """
    outputPath = "星辰变.txt"  # only used by the disabled pipeline below
    firstSuffix, stopSuffix = 587, 681  # half-open range: 681 is excluded

    for suffix in range(firstSuffix, stopSuffix):
        print(suffix)
        # Disabled pipeline:
        # urlFormat = "https://www.biququ.la/html/27744/34680%d.html"
        # url = urlFormat % suffix
        # document = getDocumentFromUrl(url)
        # chapter = parseBiququ(document)
        # write2File(chapter, outputPath)

    print("finish")


# Start the crawl.
# NOTE(review): there is no ``if __name__ == "__main__"`` guard, so this call
# also runs whenever the module is imported.
webcrawler()

# biququ
#start = "https://www.biququ.la/html/27744/346801.html"
#end = "https://www.biququ.la/html/27744/34680681.html"
# qidian
#catalogUrl = "https://book.qidian.com/info/118447/#Catalog"