加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
read.py 3.75 KB
一键复制 编辑 原始数据 按行查看 历史
liuhuajian 提交于 2024-03-28 17:32 . ok
import pprint
import re
import time
import pdfplumber
class ReadPDF():
    """Convert the text of a PDF into a Markdown file, one page at a time.

    Lines are classified with simple heuristics: a line containing the
    running header (``self.pattern``) is dropped, a line containing a chapter
    marker such as ``第3章`` becomes a level-1 heading (only the first time that
    chapter is seen), a line containing a dotted number such as ``5.2``
    becomes a level-2 heading, and all other lines are glued together and
    emitted one sentence (terminated by ``。``) per output line.
    """

    # Chapter marker; the whole match (e.g. "第3章") is the de-duplication key.
    _CHAPTER_RE = re.compile(r"第(\d+)章")
    # Dotted number such as "5.2" appearing anywhere in a line.
    _SECTION_RE = re.compile(r"\b\d+\.\d+\b")
    # Pages handled as two-column pages (presumably the table of contents).
    _TWO_COLUMN_PAGES = frozenset((6, 7, 8, 9, 10, 11, 12, 13))

    def __init__(self, pdf_file: str, out_put_file: str):
        """Remember the input/output paths and reset the conversion state.

        Args:
            pdf_file: Path of the PDF to read.
            out_put_file: Path of the Markdown file to (over)write.
        """
        self.page_num = 0
        self.pdf_file = pdf_file
        self.out_put_file = out_put_file
        self.page_num_file = 'page_num.txt'
        # Running header printed on the pages; lines containing it are dropped.
        self.pattern = r"信息系统项目管理师教程\(第4版\)"
        self.mulu = ''  # 6-13 (presumably the table-of-contents page range)
        # Chapter markers already emitted as headings.
        self.zhangjie = []

    def read_page_num(self, page_num=None):
        """Load the last recorded page count from ``self.page_num_file``.

        Args:
            page_num: Optional fallback stored in ``self.page_num`` when the
                file contains no non-blank line. It may be omitted; when it
                is omitted and the file is empty, ``self.page_num`` is left
                unchanged.

        Raises:
            FileNotFoundError: If the page-count file does not exist.
            ValueError: If the last non-blank line is not an integer.
        """
        with open(self.page_num_file, encoding='utf-8') as f:
            entries = [entry.strip() for entry in f if entry.strip()]
        if entries:
            # Trailing blank lines are ignored so that they cannot hide the
            # last recorded value.
            self.page_num = int(entries[-1])
        elif page_num is not None:
            self.page_num = page_num
        print(self.page_num)

    def read_pdf(self, skip_through_page: int = 197):
        """Convert the PDF to Markdown and record the running page count.

        Args:
            skip_through_page: Pages whose number is less than or equal to
                this value are counted but not converted. The default of 197
                keeps the original behaviour of starting at chapter five.
        """
        # Context managers guarantee that both output files are flushed and
        # closed even if extraction fails part-way through the document.
        with open(self.out_put_file, 'w', encoding='utf-8') as md, \
                open(self.page_num_file, 'w', encoding='utf-8') as page_num_write, \
                pdfplumber.open(self.pdf_file) as pdf:
            for page in pdf.pages:
                self.page_num += 1
                if page.page_number <= skip_through_page:
                    continue
                if page.page_number in self._TWO_COLUMN_PAGES:
                    # As before, the left column is only printed and the
                    # right column is what ends up in the Markdown file.
                    print(self._extract_half(page, left=True))
                    md.write(self._extract_half(page, left=False))
                    continue
                self.read_content(md, page.extract_text() or '')
                md.write('\n \n')
                # One line per converted page; read_page_num() reads the
                # last line of this file.
                page_num_write.write(str(self.page_num) + '\n')

    @staticmethod
    def _extract_half(page, left: bool) -> str:
        """Return the text of the left or right half of ``page``."""
        x0, top, x1, bottom = page.bbox
        middle = (x0 + x1) / 2
        bbox = (x0, top, middle, bottom) if left else (middle, top, x1, bottom)
        # Cropping is how pdfplumber limits extraction to a region.
        return page.crop(bbox).extract_text() or ''

    def read_content(self, md, text):
        """Write the Markdown rendering of one page of extracted text.

        Args:
            md: Writable text stream receiving the Markdown output.
            text: Extracted page text with lines separated by ``\\n``. Empty
                or ``None`` input writes nothing.
        """
        if not text:
            return
        pending = ''  # text collected since the last emitted sentence
        for line in text.split('\n'):
            if re.search(self.pattern, line):
                # Running page header: not part of the body text.
                continue

            chapter = self._CHAPTER_RE.search(line)
            if chapter:
                if chapter[0] in self.zhangjie:
                    # Chapter already emitted; this is a repeated marker.
                    continue
                self.zhangjie.append(chapter[0])
                if pending:
                    md.write(pending + ' \n')
                    pending = ''
                md.write('# ' + line + ' \n')
                continue

            if self._SECTION_RE.search(line):
                if pending:
                    md.write(pending + '\n')
                    pending = ''
                md.write('## ' + line + ' \n')
                continue

            head, stop, tail = line.partition('。')
            if stop:
                # Close the sentence at the first full stop; the rest of the
                # line starts the next sentence.
                md.write(pending + head + stop + ' \n ')
                pending = tail
                continue
            pending += line

        if pending:
            # Text after the last full stop used to be dropped at the end of
            # every page; write it out so that no content is lost.
            md.write(pending + ' \n')
# Script driver: build the converter for the textbook PDF and run it at once.
# Restoring the page counter with read_page_num() is intentionally not done here.
read1 = ReadPDF(pdf_file='xxxxxmgl.pdf', out_put_file='xxxxxmgl.md')
read1.read_pdf()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化