# [Web-page residue, not part of the script] Code fetch complete; the page will refresh automatically.
from logging import DEBUG
import fpdf
import urllib3, os, shutil
from bs4 import BeautifulSoup
from io import BytesIO
from PIL import Image
from fpdf import FPDF
import re
# Make a title safe to use as a file name.
# Matches the characters / \ : * ? " < > | as well as ASCII control
# characters (for example a newline embedded in a page title).
_INVALID_FILENAME_CHARS = re.compile(r'[/\\:*?"<>|\x00-\x1f]')


def toValidateTitle(title):
    """Return *title* with every file-name-unsafe character replaced by "_".

    Args:
        title: The raw title string (it may already include an extension
            such as ".pdf"; dots are left untouched).

    Returns:
        A new string in which each of ``/ \\ : * ? " < > |`` and each ASCII
        control character has been replaced by a single underscore.
    """
    return _INVALID_FILENAME_CHARS.sub("_", title)
# Get the page title
def getPageTitle():
    """Return a title for the page held in the module-level ``soup``.

    The ``content`` attribute of the first element carrying
    ``property="og:title"`` is preferred. When that element is missing or its
    content is empty, the text of the ``<title>`` element is used instead, and
    as a last resort the literal ``"untitled"`` is returned so the caller
    always gets a usable, non-empty name.

    Returns:
        The page title as a string.
    """
    og_title = soup.find(property="og:title")
    if og_title is not None:
        content = og_title.attrs.get('content')
        if content:
            return content

    # Fallback: plain <title> text, if the document has one.
    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        stripped = title_tag.string.strip()
        if stripped:
            return stripped

    return "untitled"
# Image element predicate
def isValidImage(tag):
    """Tell whether *tag* is an ``<img>`` element with a ``data-src`` attribute.

    Args:
        tag: An element object exposing ``name`` and ``has_attr()``.

    Returns:
        True when the element is named "img" and has a ``data-src``
        attribute, otherwise False.
    """
    return tag.name == "img" and tag.has_attr('data-src')
# Get the exam-paper image elements
def getPaperImageElements():
    """Return every element of the module-level ``soup`` accepted by ``isValidImage``.

    Returns:
        The result of ``soup.find_all(isValidImage)``, i.e. the ``<img>``
        elements that carry a ``data-src`` attribute.
    """
    return soup.find_all(isValidImage)
# Extract the URLs from image elements
def getUrlFromImageElements(imageElements=None):
    """Collect the non-empty ``data-src`` values of *imageElements*.

    Each URL that is kept is also reported on stdout.

    Args:
        imageElements: Iterable of elements exposing an ``attrs`` mapping.
            ``None`` (the default) is treated as an empty collection.

    Returns:
        A list of URL strings, in the order the elements were given.
        Elements without a ``data-src`` attribute, or with an empty one,
        are skipped.
    """
    urls = []
    for imageElement in imageElements or ():
        # .get() so an element lacking the attribute is skipped, not fatal.
        url = imageElement.attrs.get('data-src')
        if url:
            urls.append(url)
            print('发现图片文件: ' + url)
    return urls
# Download an image and store it as a PNG file
def downloadWebpToJpg(url, dir, name):
    """Fetch *url* and save the decoded image as ``<dir>/<name>.png``.

    The request goes through the module-level ``http`` pool manager. Images
    in RGBA mode are flattened onto a white background before saving, so the
    stored file has no alpha channel. Any failure (network, decoding,
    writing) is reported on stdout and otherwise ignored, so that one bad
    image does not abort the whole run.

    Args:
        url: Address of the image to download.
        dir: Directory the file is written to (must already exist).
        name: File name without extension.
    """
    try:
        img = http.request('GET', url)
        im = Image.open(BytesIO(img.data))
        if im.mode == "RGBA":
            im.load()  # ensure pixel data is decoded before split()
            background = Image.new("RGB", im.size, (255, 255, 255))
            # Band index 3 of an RGBA image is the alpha channel.
            background.paste(im, mask=im.split()[3])
            # Save the flattened copy; the original discarded it.
            im = background
        im.save(os.path.join(dir, name + '.png'), 'PNG')
        print('下载完成:' + name + '.png')
    except Exception:
        # Exception (not a bare except) so Ctrl+C still interrupts the run.
        print("下载失败: " + url)
# Convert all images in a folder into one PDF
# https://www.coder.work/article/5010071
def dirImagesToPdf(tmpdir, name):
    """Merge the numbered images in *tmpdir* into a single PDF file.

    Files are expected to be named ``<integer>.<ext>`` and are placed in
    ascending numeric order, one image per page. Files whose stem is not an
    integer are ignored. Images shorter than 120 px, or whose detected format
    is GIF, are skipped (they are treated as page-break decorations). Each
    image is anchored at the top-left corner of its page and, when it is
    larger than the page, scaled down uniformly so its aspect ratio is kept.

    Args:
        tmpdir: Directory containing the images.
        name: Output file name; it is passed through ``toValidateTitle``
            before the PDF is written.
    """
    print("正在合并pdf...")

    # 25.4 mm per inch / 96 px per inch: pixel -> millimetre factor.
    mm_per_px = 0.264583
    # Page width/height per orientation (210 x 297, i.e. A4 in mm).
    page_size = {'P': (210, 297), 'L': (297, 210)}

    pdf = FPDF()
    pdf.set_auto_page_break(0)

    # Pair every file with its numeric index so the sort is numeric
    # ("10" after "9"), skipping anything that is not <integer>.<ext>.
    numbered = []
    for filename in os.listdir(tmpdir):
        stem = os.path.splitext(filename)[0]
        try:
            numbered.append((int(stem), filename))
        except ValueError:
            continue
    numbered.sort()

    for _, filename in numbered:
        imgpath = os.path.join(tmpdir, filename)
        # Only the metadata is needed here; close the file handle right away.
        with Image.open(imgpath) as cover:
            width_px, height_px = cover.size
            image_format = cover.format

        # Filter out page-break decorations
        if height_px < 120 or image_format == 'GIF':
            continue

        width = float(width_px * mm_per_px)
        height = float(height_px * mm_per_px)
        orientation = 'P' if width < height else 'L'
        page_w, page_h = page_size[orientation]

        # One common factor for both axes keeps the aspect ratio; never
        # enlarge an image that already fits on the page.
        scale = min(1.0, page_w / width, page_h / height)

        pdf.add_page(orientation=orientation)
        pdf.image(imgpath, 0, 0, width * scale, height * scale)

    pdf.output(toValidateTitle(name))
    print("合并完成")
# Script entry: banner, page download, image download, PDF merge.
print('神墙PDF v0.0.1 by NekoMoYi')
print('可以下载大部分公众号内的试卷,目前会有一些多余的图,未来将会清除')
print('如果下载来格式明显不对,说明不支持,请换一份试卷试试')

# Shared connection pool; the download helper reads this global.
http = urllib3.PoolManager()

# Parsed page; stays None when the page could not be fetched or decoded.
soup = None
try:
    response = http.request('GET', input('公众号分享链接:').strip())
    if response.status == 200:
        pageContent = response.data.decode('utf-8')
        soup = BeautifulSoup(pageContent, 'html.parser')
    else:
        print('网络错误!')
except Exception:
    print('网络错误!')

if soup is None:
    # Nothing to process: stop here instead of failing later with a
    # NameError on the missing document.
    if os.name == 'nt':
        os.system('pause')  # keep the console window open on Windows
    raise SystemExit(1)

# MAIN
# Start from an empty scratch directory on every run.
tmpdir = "./temp"
if os.path.exists(tmpdir):
    shutil.rmtree(tmpdir)
os.mkdir(tmpdir)

imgs = getPaperImageElements()
urls = getUrlFromImageElements(imgs)

# Files are named by their position so the merge step can restore the order.
for cnt, url in enumerate(urls):
    downloadWebpToJpg(url, tmpdir, str(cnt))

dirImagesToPdf(tmpdir, getPageTitle() + ".pdf")

# "pause" is a Windows shell built-in; skip it elsewhere.
if os.name == 'nt':
    os.system('pause')
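# [Web-page residue, not part of the script] This section may contain content that is unsuitable for display, so the page does not show it. You can review and modify it yourself using the editing features.
# [Web-page residue, not part of the script] If you confirm that the content involves no inappropriate language / pure advertising or traffic diversion / violence / vulgar or pornographic material / infringement / piracy / false information / worthless content, and nothing that violates national laws and regulations, you can click Submit to file an appeal and we will handle it as soon as possible.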