代码拉取完成,页面将自动刷新
import pprint
import re
import time
import pdfplumber
class ReadPDF():
    """Convert the text of a PDF book into a Markdown file.

    Pages are read with pdfplumber.  Lines containing a chapter marker
    ("第N章") become level-1 headings, lines containing a dotted number
    ("N.M") become level-2 headings, and the remaining lines are joined
    into paragraphs that are emitted each time a full stop ("。") is found.
    """

    # Chapter marker such as "第5章"; searched anywhere in a line.
    _CHAPTER_RE = re.compile(r"第(\d+)章")
    # Dotted section number such as "5.1"; searched anywhere in a line.
    _SECTION_RE = re.compile(r"\b\d+\.\d+\b")
    # Pages laid out in two columns (the original note on `mulu` says 6-13).
    _TOC_PAGES = frozenset(range(6, 14))

    def __init__(self, pdf_file: str, out_put_file: str):
        """Store the input/output paths and reset the conversion state.

        Args:
            pdf_file: Path of the PDF to read.
            out_put_file: Path of the Markdown file to write.
        """
        self.page_num = 0
        self.pdf_file = pdf_file
        self.out_put_file = out_put_file
        # Progress log: one processed-page counter per line.
        self.page_num_file = 'page_num.txt'
        # Running page header; lines containing it are dropped.
        self.pattern = r"信息系统项目管理师教程\(第4版\)"
        self.mulu = ''  # 6-13
        # Chapter markers already emitted as headings.
        self.zhangjie = []

    def read_page_num(self, page_num: int = 0):
        """Load the last recorded page counter from the progress log.

        Args:
            page_num: Value to use when the log is missing or holds no
                non-blank line.  The parameter was previously unused and
                mandatory; it now defaults to 0 so the method can be
                called without arguments.

        Raises:
            ValueError: If the last non-blank line is not an integer.
        """
        try:
            with open(self.page_num_file, encoding='utf-8') as f:
                recorded = [entry.strip() for entry in f if entry.strip()]
        except FileNotFoundError:
            # No log yet (first run): fall back to the supplied value.
            recorded = []
        self.page_num = int(recorded[-1]) if recorded else page_num
        print(self.page_num)

    def read_pdf(self, skip_through_page: int = 197):
        """Convert the PDF to Markdown, page by page.

        Both output files are truncated on entry and are closed on exit,
        including when an exception interrupts the conversion.

        Args:
            skip_through_page: Pages whose 1-based number is less than or
                equal to this value are counted but not converted.  The
                default of 197 is the threshold the original code
                hard-coded (annotated there as "chapter 5").  With the
                default, the two-column pages 6-13 are never reached;
                pass a smaller value to convert them.
        """
        with open(self.out_put_file, 'w', encoding='utf-8') as md, \
                open(self.page_num_file, 'w', encoding='utf-8') as page_log, \
                pdfplumber.open(self.pdf_file) as pdf:
            for page in pdf.pages:
                if page.page_number <= skip_through_page:
                    self.page_num = self.page_num + 1
                    continue
                if page.page_number in self._TOC_PAGES:
                    self._write_toc_page(md, page)
                    self.page_num = self.page_num + 1
                    continue
                self.read_content(md, page.extract_text())
                self.page_num = self.page_num + 1
                md.write('\n \n')
                # Record progress only after the page was fully written.
                page_log.write(str(self.page_num) + '\n')

    def _write_toc_page(self, md, page):
        """Print the left column of a two-column page and write the right one."""
        # extract_text() has no x0/x1 cropping arguments, so each half of
        # the page is cropped explicitly before its text is extracted.
        x0, top, x1, bottom = page.bbox
        middle = x0 + page.width / 2
        left_text = page.crop((x0, top, middle, bottom)).extract_text()
        print(left_text)
        right_text = page.crop((middle, top, x1, bottom)).extract_text()
        md.write(right_text or '')

    def read_content(self, md, text):
        """Write one page of extracted text to ``md`` as Markdown.

        Args:
            md: Writable text stream (anything with a ``write`` method).
            text: Text of one page; ``None`` or an empty string writes
                nothing.

        Each line is handled by the first rule that applies:

        1. It contains the running page header: dropped.
        2. It contains a chapter marker: dropped if that marker was
           already emitted, otherwise written as a ``# `` heading.
        3. It contains a dotted number: written as a ``## `` heading.
        4. It contains "。": the buffered paragraph plus the line up to
           and including the first "。" is written; the rest is buffered.
        5. Otherwise the line is appended to the paragraph buffer.

        Text still buffered at the end of the page is written out rather
        than discarded.
        """
        if not text:
            return
        pending = ''
        for line in text.split('\n'):
            if re.search(self.pattern, line):
                continue

            chapter = self._CHAPTER_RE.search(line)
            if chapter:
                marker = chapter[0]
                if marker in self.zhangjie:
                    # The chapter title repeats on later pages; skip it.
                    continue
                self.zhangjie.append(marker)
                if pending:
                    md.write(pending + ' \n')
                    pending = ''
                md.write('# ' + line + ' \n')
                continue

            if self._SECTION_RE.search(line):
                if pending:
                    md.write(pending + '\n')
                    pending = ''
                md.write('## ' + line + ' \n')
                continue

            if '。' in line:
                # Only the first full stop ends the paragraph; whatever
                # follows it starts the next one.
                cut = line.index('。') + 1
                md.write(pending + line[:cut] + ' \n ')
                pending = line[cut:]
                continue

            pending += line

        if pending:
            # Previously this tail was silently lost at the end of a page.
            md.write(pending + ' \n')
# Script driver: convert the hard-coded source PDF into a Markdown file.
read1 = ReadPDF(pdf_file='xxxxxmgl.pdf', out_put_file='xxxxxmgl.md')
# Loading the previously recorded page counter is currently disabled.
# read1.read_page_num()
read1.read_pdf()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。