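# (Web page notice captured with the file, not part of the script:) Code pull complete; the page will refresh automatically.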
import os
import re
import time

import pdfplumber
# Show the first line stored in page_num.txt (the value is only printed here).
# 'a+' keeps the original "create the file if it is missing" behaviour while
# also allowing reads; plain 'a' is write-only and made readline() raise
# io.UnsupportedOperation. Append modes start positioned at end-of-file, so
# rewind before reading. The with-block closes the handle, which was
# previously left open.
with open('page_num.txt', 'a+', encoding='utf-8') as page_num:
    page_num.seek(0)
    print(page_num.readline())
def read_last_line(filename, encoding='utf-8'):
    """Return the last line of a text file with surrounding whitespace removed.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8,
            matching the encoding this script uses for the files it writes,
            instead of the platform-dependent default.

    Returns:
        The stripped final line, or '' when the file is empty. A final line
        that contains only whitespace also yields ''.

    Raises:
        OSError: If the file cannot be opened.
    """
    last_line = ''
    with open(filename, 'r', encoding=encoding) as file:
        # Stream the file line by line so only one line is held at a time.
        for line in file:
            last_line = line.strip()
    return last_line
# Markdown file that receives the extracted text.
file_name = "xxxxxmgl.md"
# First page to process, compared against page.page_number below; pages with
# a smaller number are skipped. Incremented after every processed page.
page_num = 183

# Image dumps are written under img/; create it up front so the open() calls
# below do not fail when the directory is missing.
os.makedirs("img", exist_ok=True)

# 'a' mode creates the Markdown file if it does not exist and keeps whatever
# earlier runs already wrote. Both resources are closed by the with-blocks,
# even if extraction fails part-way through.
with open(file_name, 'a', encoding='utf-8') as md:
    with pdfplumber.open("xxxxxmgl.pdf") as pdf:
        for page in pdf.pages:
            if page.page_number < page_num:
                continue

            # First pass: dump each image stream's `rawdata` bytes.
            # NOTE(review): this pass must stay ahead of the get_data() pass;
            # pdfminer appears to clear `rawdata` once a stream is decoded,
            # so a stream already decoded elsewhere would hand back None
            # here -- skip those instead of crashing in write(). TODO confirm.
            # The .png extension is kept from the original naming and does
            # not guarantee the bytes are PNG data.
            for idx, img in enumerate(page.images):
                raw_bytes = img['stream'].rawdata
                if raw_bytes is None:
                    continue
                with open(f"img/image_{page_num}_{idx}.png", "wb") as f:
                    f.write(raw_bytes)

            # Some pdfplumber versions may return None for a page without
            # text; treat that like an empty page rather than failing on
            # .split() below.
            text = page.extract_text() or ''

            # Second pass: dump the bytes returned by get_data(). The index is
            # now advanced per image; previously it stayed 0, so every image
            # on a page overwrote the same "<page>0.jpg" file.
            for index, img in enumerate(page.images):
                decoded_bytes = img['stream'].get_data()
                jpg_path = "img/" + str(page.page_number) + str(index) + ".jpg"
                with open(jpg_path, "wb") as img_file:
                    img_file.write(decoded_bytes)

            # Split the page into sentences on the Chinese full stop and write
            # one sentence per Markdown line, with in-sentence newlines removed.
            # NOTE(review): '。' is re-appended to every fragment, including
            # the last one (empty when the page text ends with '。'); kept
            # as-is so the output format matches earlier runs.
            for line in text.split('。'):
                new_line = line.replace('\n', '')
                print(new_line + '。 ')
                md.write(new_line.strip() + '。' + ' \n')

            page_num = page_num + 1
            # Blank separator between pages in the Markdown output.
            md.write('\n \n')
            print('============', page_num)
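# (Web page notice captured with the file, not part of the script:) This content may be unsuitable for display and is not shown on the page. You can review and modify it using the relevant editing features.
# If you confirm that the content does not involve inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or valueless content, or content that violates national laws and regulations, you can click submit to appeal and it will be handled as soon as possible.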