#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# Python 3 standard library reference:
# https://docs.python.org/zh-cn/3/library/index.html
# Ways to parse HTML:
# 1: regular expressions
# 2: requests-html   (pip install requests-html)
# 3: BeautifulSoup   (pip install beautifulsoup4)
# 4: lxml XPath      (pip install lxml)
#[](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9pbWcxLnR1aWNvb2wuY29tL25peUlSYkouanBnIXdlYg?x-oss-process=image/format,png)
# 5: SGMLParser
# 6: HTMLParser
# Python version: 3.11.4
# To run:
# create an empty __init__.py
import requests
import os
import sys
import time
import datetime
import html
import random
import math
import re
import json
import urllib
import shutil
import glob
import zlib
import doctest
import hashlib
from pathlib import Path
from io import StringIO
from bs4 import BeautifulSoup
def getDocumentFromUrl(url):
    """Download *url* and return the page parsed as a BeautifulSoup document.

    The request is sent with a desktop Firefox ``User-Agent`` header. The
    response body is decoded as UTF-8, HTML entities are unescaped, and the
    text is parsed with BeautifulSoup's ``html.parser`` backend.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the page text.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (this includes ``HTTPError`` for error statuses).
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Imported here so ``urllib.request`` is guaranteed to be loaded; a bare
    # ``import urllib`` does not import the submodule by itself.
    from urllib.request import Request, urlopen

    request = Request(url)
    # request.add_header('Host','www.biququ.la')
    # request.add_header('Referer','https://www.biququ.la/html/27744/')
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0')
    # The context manager closes the connection even if reading or decoding
    # raises.
    with urlopen(request) as response:
        if response.code != 200:
            # Best effort: report the unexpected status, then still try to
            # read and parse whatever body was returned.
            print("访问%s失败,返回码为:%s" % (url, response.code))
        # Decode as UTF-8 to obtain the Chinese text.
        htmlString = response.read().decode('utf-8')
    htmlString = html.unescape(htmlString)
    return BeautifulSoup(htmlString, 'html.parser')
def parseBiququ(document):
    """Extract one chapter (title plus paragraphs) from a parsed chapter page.

    The title is the text of the first ``div.bookname h1`` element and the
    body is the text of every ``div#content p`` element; both are looked up
    with ``document.body.select``. The assembled chapter is also printed.

    Args:
        document: Parsed page exposing ``body.select(css_selector)`` whose
            results have a ``text`` attribute (presumably the BeautifulSoup
            document returned by ``getDocumentFromUrl``).

    Returns:
        The title line, then one line per paragraph, then an empty line;
        every line is terminated by ``"\\n"``.

    Raises:
        IndexError: If the page contains no ``div.bookname h1`` element.
    """
    headings = document.body.select("div.bookname h1")
    if not headings:
        # Same exception type as the bare ``[0]`` lookup would raise, but
        # with a message that says what was missing from the page.
        raise IndexError("no 'div.bookname h1' title element found in document")

    lines = [headings[0].text]
    lines.extend(paragraph.text for paragraph in document.body.select("div#content p"))

    # One join instead of repeated ``+=``; the trailing "\n\n" ends the last
    # line and adds the blank separator line after the chapter.
    chapter = "\n".join(lines) + "\n\n"
    print(chapter)
    return chapter
def write2File(chapter, file):
    """Append *chapter* to the UTF-8 text file at *file*.

    The file is opened in append mode, so it is created when missing and
    existing content is kept.

    Args:
        chapter: Text to append.
        file: Path of the output file.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # ``with`` guarantees the handle is flushed and closed, even when the
    # write raises.
    with open(file, mode="a+", encoding="utf8") as fo:
        fo.write(chapter)
def webcrawler():
    """Print every chapter index from 587 through 680, then ``finish``.

    The download / parse / save steps are disabled (kept below as comments),
    so at present the function only prints the indices it would process.
    """
    output_path = "星辰变.txt"  # destination used by the disabled save step
    index, stop_index = 587, 681
    while index < stop_index:
        print(index)
        # Disabled pipeline for one chapter:
        # url = "https://www.biququ.la/html/27744/34680%d.html" % index
        # document = getDocumentFromUrl(url)
        # chapter = parseBiququ(document)
        # write2File(chapter, output_path)
        index += 1
    print("finish")
# Module-level call: the crawl starts as soon as this file is executed or
# imported, because there is no ``if __name__ == "__main__"`` guard.
webcrawler()
# biququ
#start = "https://www.biququ.la/html/27744/346801.html"
#end = "https://www.biququ.la/html/27744/34680681.html"
# qidian
#catalogUrl = "https://book.qidian.com/info/118447/#Catalog"