# Source: python_tex/process_docx.py (branch: master)

# 处理word文档
import docx
import os
import re
from PIL import Image

class article:
    """One article extracted from a Word document, plus layout estimates.

    Attributes set by ``__init__``:
        title: heading text of the article.
        paragraphs: list of body-paragraph strings.
        count_pics: number of pictures that belong to the article.
        have_pics: True when ``count_pics`` is positive at construction.
        have_subtitle: whether a subtitle was recorded (initially False).
        count_sub_title: number of subtitles recorded (initially 0).
        sub_title: subtitle text (initially the empty string).
        pic: picture sizes claimed through ``get_picture`` (initially empty).

    ``set_count`` adds ``count_title_char`` / ``count_paras_char`` and
    ``set_default_size`` adds the ``*_space`` attributes.
    """

    def __init__(self, title, paras=None, count_pics=0):
        """Create an article.

        Args:
            title: heading text.
            paras: list of paragraph strings; the list object is stored as
                given (not copied). When omitted, a new empty list is used.
            count_pics: initial number of pictures.
        """
        self.title = title
        # A fresh list per instance: a literal ``[]`` default would be one
        # shared object, so paragraphs appended to one article would show up
        # in every other article created without ``paras``.
        self.paragraphs = [] if paras is None else paras
        self.count_pics = count_pics
        self.have_subtitle = False
        self.count_sub_title = 0
        self.sub_title = ''
        self.pic = []
        self.have_pics = count_pics > 0

    def get_picture(self, pic_size):
        """Claim this article's pictures from the front of ``pic_size``.

        When the article has pictures, the first ``count_pics`` entries of
        ``pic_size`` are stored in ``self.pic`` and removed from ``pic_size``
        in place, so the next article receives the following entries.
        """
        if self.have_pics:
            self.pic = pic_size[0:self.count_pics]
            del pic_size[0:self.count_pics]

    def set_count(self):
        """Store the character counts of the title and of all paragraphs."""
        self.count_title_char = len(self.title)
        self.count_paras_char = sum(len(para) for para in self.paragraphs)

    def set_default_size(self):
        """Compute the default space estimates for each part of the article.

        Sets ``paras_space``, ``title_space``, ``sub_title_space``,
        ``paras_gap_space`` and ``total_space``. The character counts are
        computed on demand if ``set_count`` has not been called yet.
        """
        if not hasattr(self, 'count_paras_char'):
            # Previously this raised AttributeError when set_count() was skipped.
            self.set_count()
        # Presumably millimetres per point. NOTE(review): a TeX point is
        # 1/72.27 inch, so 72.7 may be a typo - confirm before changing.
        pt = 25.4 / 72.7
        char_size = 10.5 * pt
        title_size = 56 * pt
        subtitle_size = 26 * pt
        # NOTE(review): 1.56, (420 - 33.6), 0.15 and 1.167 are layout constants
        # whose meaning is not visible here (420 looks like a page width in
        # mm) - confirm with the layout specification.
        self.paras_space = self.count_paras_char * 1.56 * (char_size ** 2)  # body text
        self.title_space = 2 * (title_size) * (420 - 33.6) * 0.15  # main title
        self.sub_title_space = 2 * (subtitle_size) * (420 - 33.6) * 0.15  # subtitle
        self.paras_gap_space = len(self.paragraphs) * 1.5 * char_size * (420 - 33.6) * 0.15  # paragraph gaps
        # Articles with pictures get their body-text space scaled by 1.167.
        body_space = self.paras_space * 1.167 if self.have_pics else self.paras_space
        self.total_space = body_space + self.title_space + self.sub_title_space + self.paras_gap_space

def get_title_and_para(docx_path):
    """Read a Word file and group its paragraphs into articles.

    Every non-empty paragraph whose style name starts with 'Heading 1' opens
    a new article. Until the next such heading, non-empty paragraphs are
    handled as follows:

    * style name starting with 'Heading 2': stored as the article's subtitle
      (a later subtitle overwrites an earlier one; the counter still grows);
    * text starting with '图片<article index>': counted as a picture caption;
    * anything else: appended to the article's paragraphs.

    Non-empty paragraphs that appear before the first 'Heading 1' belong to
    no article and are skipped.

    Args:
        docx_path: path of the .docx file to read.

    Returns:
        dict mapping the 1-based article index to its ``article`` object.
    """
    result_dict = {}
    count = 0
    atcl = None  # article currently being filled; None until the first heading
    file = docx.Document(docx_path)
    for para in file.paragraphs:
        text = para.text
        if not text:
            continue
        style_name = para.style.name
        if style_name.startswith('Heading 1'):
            count += 1
            atcl = article(text, [])
            result_dict[count] = atcl
            continue
        if atcl is None:
            # Previously this path raised UnboundLocalError because no
            # article existed yet to attach the text to.
            continue
        if style_name.startswith('Heading 2'):
            atcl.sub_title = text
            atcl.have_subtitle = True
            atcl.count_sub_title += 1
        elif text.startswith(f'图片{count}'):
            atcl.have_pics = True
            atcl.count_pics += 1
        else:
            atcl.paragraphs.append(text)
    return result_dict

def get_pictures(word_path, result_path):
    """Extract the images embedded in a Word file and report image sizes.

    Each relationship of the document part whose target reference contains
    "image" is written into ``result_path`` as
    ``<word file name without extension>_<image file name>``. Afterwards
    every file found in ``result_path`` is opened with PIL and measured.

    Args:
        word_path: path of the .docx file.
        result_path: directory that receives the extracted images; created
            when the document contains at least one image.

    Returns:
        list of ``(width, height)`` tuples, one per file in ``result_path``,
        in ``os.listdir`` order (which is arbitrary). Empty when
        ``result_path`` does not exist, i.e. nothing was ever extracted.
    """
    doc = docx.Document(word_path)
    # Keep only the last path component, whichever separator the caller used.
    # The previous os.sep / '\\' test split on the wrong separator on POSIX
    # and let directory parts leak into the output file name.
    word_name = os.path.splitext(word_path)[0]
    new_name = re.split(r'[\\/]', word_name)[-1]

    for rel in doc.part._rels.values():
        if "image" not in rel.target_ref:  # not an image relationship
            continue
        if getattr(rel, "is_external", False):
            # Linked (not embedded) targets carry no bytes to write out.
            continue
        os.makedirs(result_path, exist_ok=True)
        # Part references use '/' separators; take the file name only. This
        # also works when the reference has no '/' at all.
        img_name = rel.target_ref.rsplit('/', 1)[-1]
        img_name = f'{new_name}_{img_name}'  # name of the extracted image
        with open(f'{result_path}/{img_name}', "wb") as f:
            f.write(rel.target_part.blob)

    # Measure the images in the directory they were written to; the previous
    # code always read './img' regardless of ``result_path``.
    if not os.path.isdir(result_path):
        return []
    result = []
    for name in os.listdir(result_path):
        # The context manager closes the underlying file handle.
        with Image.open(os.path.join(result_path, name)) as img:
            result.append((img.width, img.height))
    return result

def prn_obj(obj):
    """Print each instance attribute of ``obj`` as ``name:value``, one per line."""
    lines = []
    for name, value in obj.__dict__.items():
        lines.append('%s:%s' % (name, value))
    print('\n'.join(lines))

if __name__ == '__main__':
    # Demo run: parse ./test.docx, extract its images into ./img, then print
    # the article count followed by every article's attributes.
    articles = get_title_and_para('./test.docx')
    sizes = get_pictures('./test.docx', './img')
    print(len(articles))
    for item in articles.values():
        item.set_count()
        item.set_default_size()
        # Each article takes its share from the front of the shared size list.
        item.get_picture(sizes)
        prn_obj(item)