加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
novel_qidian.py 2.50 KB
一键复制 编辑 原始数据 按行查看 历史
Candyメ奶糖 提交于 2023-08-13 15:46 . python code
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# Python3 标准库
# https://docs.python.org/zh-cn/3/library/index.html
# 解析HTML
#1: 正则表达式大法
#2: requests-html pip install requests-html
#3: BeautifulSoup pip install beautifulsoup4
#4: lxml.XPath pip install lxml
#[](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9pbWcxLnR1aWNvb2wuY29tL25peUlSYkouanBnIXdlYg?x-oss-process=image/format,png)
#5: SGMLParser
#6: HTMLParser
# version: 3.11.4
# 执行
# 创建空白__init__.py
import requests
import os
import sys
import time
import datetime
import html
import random
import math
import re
import json
import urllib
import shutil
import glob
import zlib
import doctest
import hashlib
from pathlib import Path
from io import StringIO
from bs4 import BeautifulSoup
def getDocumentFromUrl(url):
    """Download *url* and return the page parsed as a BeautifulSoup document.

    The request carries a desktop Firefox ``User-Agent`` header. The response
    body is decoded as UTF-8, HTML entities are unescaped, and the resulting
    text is parsed with the ``html.parser`` backend.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the downloaded markup.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` for error statuses is a subclass).
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    import urllib.request

    request = urllib.request.Request(url)
    # Optional headers, currently disabled:
    # request.add_header('Host','www.biququ.la')
    # request.add_header('Referer','https://www.biququ.la/html/27744/')
    request.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0',
    )

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        if response.status != 200:
            # Apply the format arguments; passing them as extra print()
            # arguments would emit the raw "%s" placeholders.
            print("访问%s失败,返回码为:%s" % (url, response.status))
        # Decode as UTF-8 to obtain the (Chinese) page text.
        htmlString = response.read().decode('utf-8')

    htmlString = html.unescape(htmlString)
    return BeautifulSoup(htmlString, 'html.parser')
def parseBiququ(document):
    """Extract one chapter (title plus paragraphs) from a parsed page.

    The title is the text of the first ``div.bookname h1`` match and the body
    is the text of every ``div#content p`` match, one per line. The assembled
    chapter ends with a blank line, is printed to stdout, and is returned.

    Args:
        document: Parsed page exposing ``body.select(css_selector)``.

    Returns:
        The chapter text: title line, paragraph lines, then an empty line.

    Raises:
        IndexError: If no ``div.bookname h1`` element is found.
    """
    heading = document.body.select("div.bookname h1")[0].text
    paragraph_nodes = document.body.select("div#content p")

    # Title first, then each paragraph; every piece ends up on its own line.
    pieces = [heading]
    pieces.extend(node.text for node in paragraph_nodes)

    # One newline terminates the last piece, the second adds the blank
    # separator line after the chapter.
    chapter = "\n".join(pieces) + "\n\n"
    print(chapter)
    return chapter
def write2File(chapter, file):
    """Append *chapter* to the UTF-8 text file at *file*.

    The file is created if it does not exist; existing content is kept and
    the new text is written after it.

    Args:
        chapter: Text to append.
        file: Path of the destination file.
    """
    # ``with`` guarantees the handle is flushed and closed; the previous
    # ``fo.close`` (without parentheses) never actually closed the file.
    with open(file, mode="a", encoding="utf8") as fo:
        fo.write(chapter)
def webcrawler():
    """Walk the chapter index range and report progress on stdout.

    Prints every index from 587 through 680, then ``finish``. The download,
    parse and save steps are disabled and kept as commented-out reference
    code inside the loop.
    """
    output_path = "星辰变.txt"  # destination used by the disabled save step
    first_index, stop_index = 587, 681  # stop_index is exclusive

    chapter_index = first_index
    while chapter_index < stop_index:
        print(chapter_index)
        # Disabled pipeline (fetch -> parse -> append to file):
        # urlFormat = "https://www.biququ.la/html/27744/34680%d.html"
        # url = urlFormat % chapter_index
        # document = getDocumentFromUrl(url)
        # chapter = parseBiququ(document)
        # write2File(chapter, output_path)
        chapter_index += 1

    print("finish")
# Script entry point: this call is at module level, so the crawl loop runs
# whenever the file is executed (and also if it is imported).
webcrawler()
# biququ
#start = "https://www.biququ.la/html/27744/346801.html"
#end = "https://www.biququ.la/html/27744/34680681.html"
# qidian
#catalogUrl = "https://book.qidian.com/info/118447/#Catalog"
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化