# (Web-page residue, not part of the program:) Code fetch complete; the page will refresh automatically.
# 处理word文档
import docx
import os
import re
from PIL import Image
class article:
    """One article extracted from a Word document.

    Attributes set by ``__init__``:
        title: heading text of the article.
        paragraphs: list of body-paragraph strings.
        count_pics: number of pictures belonging to the article.
        have_pics: True when ``count_pics`` is greater than zero.
        have_subtitle: whether a subtitle was recorded (starts False).
        count_sub_title: number of subtitles recorded (starts at 0).
        sub_title: subtitle text (starts empty).
        pic: picture entries assigned by ``get_picture`` (starts empty).

    ``set_count`` and ``set_default_size`` add further attributes.
    """

    def __init__(self, title, paras=None, count_pics=0):
        """Create an article.

        Args:
            title: heading text.
            paras: list of paragraph strings. The list object passed in is
                stored as-is (not copied); when omitted, a new empty list
                is created for this instance.
            count_pics: initial number of pictures.
        """
        self.title = title
        # A ``[]`` default would be shared by every instance created
        # without ``paras``; use a None sentinel and build a fresh list.
        self.paragraphs = [] if paras is None else paras
        self.count_pics = count_pics
        self.have_subtitle = False
        self.count_sub_title = 0
        self.sub_title = ''
        self.pic = []
        self.have_pics = count_pics > 0

    def get_picture(self, pic_size):
        """Take this article's pictures from the front of ``pic_size``.

        When the article has pictures, the first ``count_pics`` entries of
        ``pic_size`` are stored in ``self.pic`` and removed from
        ``pic_size`` in place, so the next article sees the remainder.
        """
        if self.have_pics:
            self.pic = pic_size[0:self.count_pics]
            pic_size[0:self.count_pics] = []

    def set_count(self):
        """Record the character counts of the title and of the body."""
        self.count_title_char = len(self.title)
        self.count_paras_char = sum(len(para) for para in self.paragraphs)

    def set_default_size(self):
        """Compute the default space estimates for the article.

        Requires ``set_count`` to have been called first, because it reads
        ``count_paras_char``. Sets ``paras_space``, ``title_space``,
        ``sub_title_space``, ``paras_gap_space`` and ``total_space``.
        """
        # Presumably converts typographic points to millimetres.
        # NOTE(review): 72.7 looks unusual (72 points per inch is the
        # common value) -- confirm before changing.
        pt = 25.4 / 72.7
        char_size = 10.5 * pt
        title_size = 56 * pt
        subtitle_size = 26 * pt
        # NOTE(review): (420 - 33.6) * 0.15 is presumably a usable width
        # scaled by a layout factor -- TODO confirm the meaning.
        self.paras_space = self.count_paras_char * 1.56 * (char_size ** 2)  # body text
        self.title_space = 2 * (title_size) * (420 - 33.6) * 0.15  # main title
        self.sub_title_space = 2 * (subtitle_size) * (420 - 33.6) * 0.15  # subtitle
        self.paras_gap_space = len(self.paragraphs) * 1.5 * char_size * (420 - 33.6) * 0.15  # gaps between paragraphs
        if self.have_pics:
            # Articles with pictures get the body space scaled by 1.167.
            self.total_space = self.paras_space * 1.167 + self.title_space + self.sub_title_space + self.paras_gap_space
        else:
            self.total_space = self.paras_space + self.title_space + self.sub_title_space + self.paras_gap_space
def get_title_and_para(docx_path):
    """Read a Word file and group its text into articles.

    Each non-empty paragraph styled "Heading 1" starts a new article.
    Following non-empty paragraphs are attached to that article: a
    "Heading 2" paragraph becomes its subtitle, a paragraph whose text
    starts with '图片<article index>' is counted as a picture caption,
    and anything else is appended to the article's body paragraphs.
    Non-empty paragraphs that appear before the first "Heading 1" have no
    article to belong to and are skipped.

    Args:
        docx_path: path of the .docx file to read.

    Returns:
        dict mapping the 1-based article index to its ``article`` object.
    """
    result_dict = dict()
    count = 0
    atcl = None  # current article; stays None until the first Heading 1
    file = docx.Document(docx_path)
    for para in file.paragraphs:
        text = para.text
        if not text:
            continue
        style_name = para.style.name
        if style_name.startswith('Heading 1'):
            count += 1
            atcl = article(text, [])
            result_dict[count] = atcl
            continue
        if atcl is None:
            # Without this guard, content ahead of the first heading would
            # hit an unbound ``atcl`` and crash.
            continue
        if style_name.startswith('Heading 2'):
            atcl.sub_title = text
            atcl.have_subtitle = True
            atcl.count_sub_title += 1
        elif text.startswith(f'图片{count}'):
            atcl.have_pics = True
            atcl.count_pics += 1
        else:
            atcl.paragraphs.append(text)
    return result_dict
def get_pictures(word_path, result_path):
    """Extract embedded images from a Word file and return image sizes.

    Every internal relationship of the document whose target reference
    contains "image" is written to ``result_path`` as
    ``<document name>_<image file name>``. Afterwards every file found in
    ``result_path`` is opened with PIL and its size collected, so files
    already present in that directory are included as well.

    Args:
        word_path: path of the .docx file.
        result_path: directory that receives the images; created on
            demand when the document contains at least one image.

    Returns:
        list of ``(width, height)`` tuples, ordered by file name with
        embedded numbers compared numerically (image2 before image10).
        Empty when ``result_path`` does not exist.
    """
    def _natural_key(name):
        # Split into text/number runs so numeric parts compare as ints.
        return [int(tok) if tok.isdigit() else tok
                for tok in re.split(r'(\d+)', name)]

    doc = docx.Document(word_path)
    # Document name without extension or directory; accept both '/' and
    # '\\' so a directory part never leaks into the image file name.
    doc_name = re.split(r'[\\/]', os.path.splitext(word_path)[0])[-1]
    for rel in doc.part._rels.values():
        # External targets (e.g. hyperlinks) have no embedded part to
        # read, even if their URL happens to contain "image".
        if getattr(rel, 'is_external', False):
            continue
        if "image" not in rel.target_ref:
            continue
        os.makedirs(result_path, exist_ok=True)
        img_name = f'{doc_name}_{os.path.basename(rel.target_ref)}'
        with open(os.path.join(result_path, img_name), "wb") as f:
            f.write(rel.target_part.blob)

    if not os.path.isdir(result_path):
        # No image was written and the directory was never created.
        return []

    result = []
    # os.listdir returns names in arbitrary order; sort them so the sizes
    # come back in a stable order for positional consumers.
    for name in sorted(os.listdir(result_path), key=_natural_key):
        # The context manager closes the underlying file handle.
        with Image.open(os.path.join(result_path, name)) as img:
            result.append((img.width, img.height))
    return result
def prn_obj(obj):
    """Print every instance attribute of ``obj`` as ``name:value``, one per line."""
    lines = []
    for name, value in obj.__dict__.items():
        lines.append('%s:%s' % (name, value))
    print('\n'.join(lines))
if __name__ == '__main__':
    # Parse the articles, extract the pictures, then report on each article.
    result = get_title_and_para('./test.docx')
    pic_size = get_pictures('./test.docx','./img')
    print(len(result))
    for atcl in result.values():
        atcl.set_count()
        atcl.set_default_size()
        # Each article takes its own pictures off the front of pic_size.
        atcl.get_picture(pic_size)
        prn_obj(atcl)
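# (Web-page residue, not part of the program:) This section may contain content unsuitable for display and is not shown. You can review and modify it using the editing features.
# If you confirm the content does not involve inappropriate language, advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or content that violates applicable laws and regulations, you may submit an appeal and it will be handled as soon as possible.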