加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
process_docx.py 4.31 KB
一键复制 编辑 原始数据 按行查看 历史
曾振宇 提交于 2021-04-20 16:12 . 获得图片尺寸以及副标题
# 处理word文档
import docx
import os
import re
from PIL import Image
class article:
    """One article extracted from a Word document.

    Attributes set by ``__init__``:
        title: Title text.
        paragraphs: List of body-paragraph strings.
        count_pics: Number of pictures attributed to the article.
        have_pics: True when ``count_pics`` was positive at construction.
        have_subtitle: Whether a subtitle has been recorded (starts False).
        count_sub_title: Number of subtitles recorded (starts at 0).
        sub_title: Subtitle text (starts empty).
        pic: Picture entries taken by :meth:`get_picture` (starts empty).

    Attributes set later:
        count_title_char, count_paras_char: set by :meth:`set_count`.
        paras_space, title_space, sub_title_space, paras_gap_space,
        total_space: set by :meth:`set_default_size`.
    """

    def __init__(self, title, paras=None, count_pics=0):
        """Create an article.

        Args:
            title: Title text.
            paras: List of paragraph strings. The list is stored by
                reference; when omitted, a fresh empty list is created for
                this instance (a ``[]`` default would be shared by every
                instance built without an explicit list).
            count_pics: Initial picture count.
        """
        self.title = title
        self.paragraphs = [] if paras is None else paras
        self.count_pics = count_pics
        self.have_subtitle = False
        self.count_sub_title = 0
        self.sub_title = ''
        self.pic = []
        self.have_pics = count_pics > 0

    def get_picture(self, pic_size):
        """Take this article's entries from the front of ``pic_size``.

        When ``have_pics`` is true, the first ``count_pics`` items are copied
        into ``self.pic`` and removed from ``pic_size`` IN PLACE, so the next
        article sees only the remaining items. Does nothing otherwise.

        Args:
            pic_size: Mutable list of picture entries; it is consumed.
        """
        if self.have_pics:
            self.pic = pic_size[0:self.count_pics]
            del pic_size[0:self.count_pics]

    def set_count(self):
        """Store the character counts of the title and of all paragraphs."""
        self.count_title_char = len(self.title)
        self.count_paras_char = sum(len(para) for para in self.paragraphs)

    def set_default_size(self):
        """Compute the default space estimates for each part of the article.

        Sets ``paras_space``, ``title_space``, ``sub_title_space``,
        ``paras_gap_space`` and ``total_space``. If :meth:`set_count` has not
        been called yet it is called first, so this method no longer fails
        with ``AttributeError`` depending on call order.

        NOTE(review): units are not stated in the code. ``25.4 / 72.7`` looks
        like a point-to-millimetre factor, but a point is normally 1/72 inch;
        confirm that 72.7 is intended. The meaning of ``420 - 33.6`` and
        ``0.15`` is likewise not documented here.
        """
        if not hasattr(self, 'count_paras_char'):
            self.set_count()

        pt = 25.4 / 72.7
        char_size = 10.5 * pt
        title_size = 56 * pt
        subtitle_size = 26 * pt

        # Body text.
        self.paras_space = self.count_paras_char * 1.56 * (char_size ** 2)
        # Main title.
        self.title_space = 2 * (title_size) * (420 - 33.6) * 0.15
        # Subtitle (added whether or not a subtitle was recorded).
        self.sub_title_space = 2 * (subtitle_size) * (420 - 33.6) * 0.15
        # Gaps between paragraphs.
        self.paras_gap_space = len(self.paragraphs) * 1.5 * char_size * (420 - 33.6) * 0.15

        if self.have_pics:
            # Articles with pictures get the body-text space scaled by 1.167.
            self.total_space = self.paras_space * 1.167 + self.title_space + self.sub_title_space + self.paras_gap_space
        else:
            self.total_space = self.paras_space + self.title_space + self.sub_title_space + self.paras_gap_space
def get_title_and_para(docx_path):
    """Read a Word document and group its text into articles.

    Paragraphs with empty text are ignored. For the rest:

    * a style name starting with ``'Heading 1'`` opens a new article whose
      title is the paragraph text;
    * a style name starting with ``'Heading 2'`` is recorded as the current
      article's subtitle (a later one overwrites ``sub_title`` while
      ``count_sub_title`` keeps counting);
    * text starting with ``'图片<n>'``, where ``<n>`` is the current article
      index, counts as one picture of the current article;
    * anything else is appended to the current article's paragraphs.

    Text that appears before the first ``'Heading 1'`` has no article to
    belong to and is skipped (it previously raised ``UnboundLocalError``).

    Args:
        docx_path: Path passed to ``docx.Document``.

    Returns:
        dict mapping the 1-based article index to its ``article`` object.
    """
    result_dict = {}
    count = 0
    atcl = None  # current article; None until the first 'Heading 1'
    file = docx.Document(docx_path)
    for para in file.paragraphs:
        text = para.text
        if not text:
            continue
        style_name = para.style.name
        if style_name.startswith('Heading 1'):
            count += 1
            atcl = article(text, [])
            result_dict[count] = atcl
        elif atcl is None:
            # Content ahead of the first title: nothing to attach it to.
            continue
        elif style_name.startswith('Heading 2'):
            atcl.sub_title = text
            atcl.have_subtitle = True
            atcl.count_sub_title += 1
        elif text.startswith(f'图片{count}'):
            atcl.have_pics = True
            atcl.count_pics += 1
        else:
            atcl.paragraphs.append(text)
    return result_dict
def _natural_key(name):
    """Sort key that orders digit runs numerically (image2 before image10)."""
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', name)]


def get_pictures(word_path, result_path):
    """Save the images embedded in a Word document and return image sizes.

    Every relationship of the document part whose target reference contains
    ``"image"`` is written to ``result_path`` as
    ``<document name>_<image file name>``. Afterwards every file found in
    ``result_path`` is opened with PIL and its size is collected, so files
    already present in that directory are included as well.

    Args:
        word_path: Path of the .docx file.
        result_path: Directory that receives the images; created on demand.

    Returns:
        List of ``(width, height)`` tuples, one per file in ``result_path``,
        ordered by file name with numbers compared numerically. Empty when
        the directory does not exist (no image was extracted).
        NOTE(review): this assumes the numbering of the image file names
        follows the order in which pictures belong to articles — confirm.
    """
    doc = docx.Document(word_path)
    # Document file name without directory or extension; basename copes with
    # whichever separators the current platform recognises.
    doc_stem = os.path.splitext(os.path.basename(word_path))[0]

    for rel in doc.part._rels.values():
        if "image" not in rel.target_ref:
            continue
        if rel.is_external:
            # Linked (not embedded) images have no target part to read.
            continue
        os.makedirs(result_path, exist_ok=True)
        # target_ref looks like "media/image1.png"; keep the file name only.
        img_name = rel.target_ref.rsplit('/', 1)[-1]
        out_path = os.path.join(result_path, f'{doc_stem}_{img_name}')
        with open(out_path, "wb") as f:
            f.write(rel.target_part.blob)

    if not os.path.isdir(result_path):
        return []

    result = []
    # os.listdir order is arbitrary; sort so the result is deterministic.
    for name in sorted(os.listdir(result_path), key=_natural_key):
        # Context manager closes the underlying file handle promptly.
        with Image.open(os.path.join(result_path, name)) as img:
            result.append((img.width, img.height))
    return result
def prn_obj(obj):
    """Print each instance attribute of ``obj`` as ``name:value``, one per line."""
    lines = []
    for key, value in obj.__dict__.items():
        lines.append(f'{key!s}:{value!s}')
    print('\n'.join(lines))
if __name__ == '__main__':
    # Demo run: parse the sample document, extract its pictures, then
    # compute and print the attributes of every article.
    result = get_title_and_para('./test.docx')
    pic_size = get_pictures('./test.docx', './img')
    print(len(result))
    for atcl in result.values():
        atcl.set_count()
        atcl.set_default_size()
        # Consumes this article's entries from the shared pic_size list.
        atcl.get_picture(pic_size)
        prn_obj(atcl)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化