加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
GodWallPdf.py 3.30 KB
一键复制 编辑 原始数据 按行查看 历史
默一 提交于 2021-12-13 12:38 . First commit.
from logging import DEBUG
import fpdf
import urllib3, os, shutil
from bs4 import BeautifulSoup
from io import BytesIO
from PIL import Image
from fpdf import FPDF
import re
# Sanitize a string for use as a filename (replace characters that are
# illegal in Windows filenames with underscores).
def toValidateTitle(title):
    """Return *title* with every / \\ : * ? " < > | replaced by '_'."""
    forbidden = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
    return re.sub(forbidden, "_", title)
# 获取页面标题
def getPageTitle():
    """Return the page title read from the <meta property="og:title"> tag.

    Relies on the module-level ``soup`` built from the fetched page.
    """
    og_title = soup.find_all(property="og:title")[0]
    return og_title.attrs['content']
# 判断图片元素
def isValidImage(tag):
    """BeautifulSoup filter: True for <img> tags that carry a data-src attribute."""
    if tag.name != "img":
        return False
    return tag.has_attr('data-src')
# 获取试卷图片
def getPaperImageElements():
    """Return all <img data-src=...> elements in the module-level ``soup``."""
    return soup.find_all(isValidImage)
# 从图像对象得到URL
def getUrlFromImageElements(imageElements=None):
    """Collect the 'data-src' URL from each image element.

    Args:
        imageElements: iterable of tag-like objects exposing an ``attrs``
            dict containing a 'data-src' key. Defaults to an empty sequence.

    Returns:
        list[str]: every non-empty URL found, in input order.
    """
    # Use None as the default instead of a mutable [] literal — a shared
    # mutable default is the classic Python pitfall.
    if imageElements is None:
        imageElements = []
    urls = []
    for imageElement in imageElements:
        url = imageElement.attrs['data-src']
        if url:
            urls.append(url)
            print('发现图片文件: ' + url)
    return urls
# 下载webp并转换为jpg
def downloadWebpToJpg(url, dir, name):
    """Download an image and save it as ./{dir}/{name}.png.

    RGBA images are flattened onto a white background first (the output is
    PNG despite the "Jpg" in the name — kept for caller compatibility).
    Errors are reported but never raised, so one bad image does not abort
    the whole batch. Uses the module-level urllib3 PoolManager ``http``.
    """
    try:
        img = http.request('GET', url)
        byte_stream = BytesIO(img.data)
        im = Image.open(byte_stream)
        if im.mode == "RGBA":
            im.load()
            background = Image.new("RGB", im.size, (255, 255, 255))
            # Composite onto white using the alpha channel as the paste mask.
            background.paste(im, mask=im.split()[3])
            # Bug fix: the original discarded ``background`` and saved the
            # untouched RGBA image instead.
            im = background
        im.save('./{}/{}.png'.format(dir, name), 'PNG')
        print('下载完成:' + name + '.png')
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit escape.
        print("下载失败: " + url)
# 将文件夹内所有图片转换为pdf
# https://www.coder.work/article/5010071
# 将文件夹内所有图片转换为pdf
# https://www.coder.work/article/5010071
def dirImagesToPdf(tmpdir, name):
    """Merge every image in *tmpdir* (files named <number>.<ext>) into a PDF.

    Pages are added in numeric filename order. Very short images
    (height < 120 px — used by the source page as page-break separators)
    and GIFs are skipped. The PDF is written to the current directory
    under a filename-sanitized *name*.
    """
    print("正在合并pdf...")
    pdf = FPDF()
    pdf.set_auto_page_break(0)
    # A4 page size in millimetres, portrait and landscape.
    # Hoisted out of the loop — it was rebuilt on every iteration.
    pdf_size = {'P': {'w': 210, 'h': 297}, 'L': {'w': 297, 'h': 210}}
    mm_per_px = 0.264583  # px -> mm, assumes 96 dpi source images
    images = sorted(os.listdir(tmpdir), key=lambda x: int(x[:-4]))
    for img in images:
        imgpath = os.path.join(tmpdir, img)
        # Context manager closes the file handle promptly (the original
        # left every opened image file un-closed).
        with Image.open(imgpath) as cover:
            px_w, px_h = cover.size
            fmt = cover.format
        # 过滤分页符 (skip page-break markers and GIF placeholders)
        if px_h < 120 or fmt == 'GIF':
            continue
        width = float(px_w * mm_per_px)
        height = float(px_h * mm_per_px)
        orientation = 'P' if width < height else 'L'
        # Clamp to the page so oversized images don't overflow.
        width = min(width, pdf_size[orientation]['w'])
        height = min(height, pdf_size[orientation]['h'])
        pdf.add_page(orientation=orientation)
        pdf.image(imgpath, 0, 0, width, height)
    pdf.output(toValidateTitle(name))
    print("合并完成")
# --- script entry: fetch the shared page, download the images, build the PDF ---
print('神墙PDF v0.0.1 by NekoMoYi')
print('可以下载大部分公众号内的试卷,目前会有一些多余的图,未来将会清除')
print('如果下载来格式明显不对,说明不支持,请换一份试卷试试')
http = urllib3.PoolManager()
soup = None
try:
    response = http.request('GET', input('公众号分享链接:'))
    if response.status == 200:
        pageContent = response.data.decode('utf-8')
        soup = BeautifulSoup(pageContent, 'html.parser')
    else:
        print('网络错误!')
except Exception:
    # Narrowed from a bare except so Ctrl-C still interrupts the prompt.
    print('网络错误!')
if soup is None:
    # Bug fix: the original kept running after a failed fetch and crashed
    # later with NameError on ``soup``; abort cleanly instead.
    raise SystemExit(1)
# MAIN
tmpdir = "./temp"
if os.path.exists(tmpdir):
    shutil.rmtree(tmpdir)  # start from a clean scratch directory
os.mkdir(tmpdir)
imgs = getPaperImageElements()
urls = getUrlFromImageElements(imgs)
# enumerate replaces the manual cnt counter.
for cnt, url in enumerate(urls):
    downloadWebpToJpg(url, tmpdir, str(cnt))
dirImagesToPdf(tmpdir, getPageTitle() + ".pdf")
os.system('pause')  # Windows-only: keep the console window open when done
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化