# The repository was fetched successfully.
# This action will force synchronization from 张翔宇/数据分析_1. It will overwrite any changes you have made since you forked the repository, and those changes cannot be recovered!
# The synchronization runs in the background, and the page will refresh when it finishes. Please be patient.
import os
import pdfplumber
def pdf2txt(ThePath: str, aim_path: str):
    """Convert the PDFs found under each sub-directory of ``ThePath`` to text files.

    For every immediate sub-directory ``<sub>`` of ``ThePath`` a directory
    ``aim_path/<sub>`` is created, the sub-directory is walked recursively,
    and each ``*.pdf`` file is written to ``aim_path/<sub>/<stem>.txt``
    (UTF-8). Files found in nested folders are flattened into
    ``aim_path/<sub>``.

    A PDF that cannot be converted does not stop the run: the exception is
    printed and the PDF's path is appended, one per line, to ``错误信息.txt``
    in the current working directory.

    Args:
        ThePath: Directory whose immediate sub-directories contain the PDFs.
        aim_path: Directory that receives the generated ``.txt`` files.

    Raises:
        OSError: If ``ThePath`` cannot be listed or an output directory
            cannot be created.
    """

    def parsePDF(sub_dir, file, pdf_path, txtpath):
        """Write the text of every page of ``pdf_path`` to ``txtpath``.

        Exceptions propagate to the caller so that the failure can be
        recorded in the error log.
        """
        # Open the PDF first: if it is unreadable, no empty .txt is created.
        with pdfplumber.open(pdf_path) as pdf, \
                open(txtpath, "w", encoding='utf-8') as txt:
            for i, page in enumerate(pdf.pages):
                print(f'正在转换{sub_dir}-{file}-{i + 1}页')
                # extract_text() may yield None for a page without text
                # (depends on the pdfplumber version); the trailing newline
                # keeps the last word of one page from being glued to the
                # first word of the next.
                txt.write((page.extract_text() or '') + '\n')

    # Main routine
    for sub_dir in os.listdir(ThePath):
        dir_path = os.path.join(ThePath, sub_dir)
        if not os.path.isdir(dir_path):
            # Plain files directly inside ThePath were never converted;
            # skip them instead of creating an empty output directory.
            continue

        out_dir = os.path.join(aim_path, sub_dir)
        os.makedirs(out_dir, exist_ok=True)

        for root, _dirs, files in os.walk(dir_path, topdown=False):
            for file in files:
                # Ignore non-PDF entries such as ".DS_Store".
                if not file.lower().endswith('.pdf'):
                    continue
                pdf_path = os.path.join(root, file)
                # splitext keeps dots inside the name ("a.2007.pdf" -> "a.2007").
                txt_path = os.path.join(out_dir, os.path.splitext(file)[0] + '.txt')
                try:
                    parsePDF(sub_dir, file, pdf_path, txt_path)
                except Exception as e:
                    print(f"发生异常: {str(e)}")
                    with open('错误信息.txt', 'a', encoding='UTF-8', errors='ignore') as f:
                        f.write(pdf_path + '\n')


if __name__ == "__main__":
    # Default locations used when this file is run as a script.
    ThePath = r'/Users/liujingyu/Downloads/gitee/cipin/3/pdf/2007年报'
    aim_path = r'/Users/liujingyu/Downloads/gitee/cipin/3/txt'
    pdf2txt(ThePath, aim_path)
# 此处可能存在不合适展示的内容，页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容，可点击提交进行申诉，我们将尽快为您处理。