加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
main.py 6.75 KB
一键复制 编辑 原始数据 按行查看 历史
6han 提交于 2021-06-15 04:50 . 初始
# -*- coding: utf-8 -*-
'''
Created : 2021-05-31
@author: hdh
'''
import re
import shutil
from argparse import ArgumentParser
from datetime import datetime
from time import sleep
from docx.shared import Inches
from docxtpl import DocxTemplate, RichText, InlineImage
from docx import Document
from win32com import client as wc
import sys
import os
from docxcompose.composer import Composer
# import traceback
# Directory the tool is launched from; the Templates, SourceFiles and Output
# folders are all resolved relative to it.
home = os.getcwd()
# Output folder, kept with a trailing backslash because callers append file
# names to it directly.  The backslashes are escaped explicitly: the previous
# f'{home}\Output\\' relied on the invalid escape sequence '\O'.
output_dir = home + '\\Output\\'
print(f"工作目录为:{home}")
def parse_args():
    """
    Parse the command-line arguments.

    The parser declares no options of its own; it exists so that
    ``-h``/``--help`` prints the usage notes given in the description.

    Returns:
        argparse.Namespace: the parsed arguments (empty, since no options
        are declared).
    """
    parser = ArgumentParser(description=(
        "文档转换工具说明.\n "
        "1.请先保存wps,word应用程序的工作并关闭应用。\n "
        "2.请将需要转换的文档放入当前路径的SourceFiles文件夹下。 \n "
        "3.若需调整目标文档的格式,请编辑Templates文件夹中的resume.docx文件。\n "
        "4.头像图片必须存在,默认第一张为证件照,若缺失需用空白图片占位或者自定义修改。 \n "
        "5.报错文档请手工处理。 \n "
        "6.为避免源文档存在特殊情况,文档生成后,最好人工核对。"
    ))
    # Previously the namespace was computed and then dropped by a bare
    # ``return``, so every caller received None.
    return parser.parse_args()
# Compute age
def calculate_age(birth, today=None):
    """
    Return the age in completed years for a birth date.

    Args:
        birth: birth date as a ``YYYY.MM.DD`` string.
        today: optional ``datetime`` (or ``date``) to measure against;
            defaults to the current local time.

    Returns:
        int: number of birthdays reached on or before ``today``.

    Raises:
        ValueError: if ``birth`` does not match the ``%Y.%m.%d`` format.
    """
    birth_d = datetime.strptime(birth, "%Y.%m.%d")
    today_d = datetime.now() if today is None else today
    # Compare (month, day) pairs instead of moving the birth date into the
    # current year with replace(year=...): that call raises ValueError for a
    # 29 February birthday whenever the current year is not a leap year.
    birthday_reached = (today_d.month, today_d.day) >= (birth_d.month, birth_d.day)
    age = today_d.year - birth_d.year
    if not birthday_reached:
        age -= 1
    return age
# Convert a .doc file to .docx
def conv_format(filename, target_filename):
    """
    Save a Word document under a new name in format 12 via Word automation.

    Args:
        filename: path of the document to open.
        target_filename: path the converted copy is written to.

    The Word application started here is always quit, and the opened document
    always closed, even when opening or saving fails; otherwise a failed
    conversion would leave a Word process running.
    """
    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(filename)
        try:
            doc.SaveAs(target_filename, 12)  # file at the converted path
        finally:
            doc.Close()
    finally:
        word.Quit()
def get_pictures(word_path, target_path):
    """
    Extract the images referenced by a .docx document into a folder.

    Each image is written as ``<document base name><image file name>``; for
    example ``resume.docx`` with ``media/image1.png`` yields
    ``resumeimage1.png``.  Extraction is best effort: problems are reported
    on the console instead of being raised, and a failure on one image does
    not stop the remaining images from being extracted.

    :param word_path: path of the .docx document
    :param target_path: folder the images are written to (created if missing)
    :return: None
    """
    if not os.path.exists(target_path):
        os.makedirs(target_path)
    try:
        doc = Document(word_path)
    except Exception as e:
        # Still best effort, but no longer silent.
        print(f"图片提取失败,无法打开文档:{word_path} {e!r}")
        return
    # Base name without extension, accepting either path separator.
    doc_name = os.path.splitext(word_path)[0].replace('\\', '/').split('/')[-1]
    for rel in doc.part._rels.values():
        if "image" not in rel.target_ref:
            continue
        if rel.is_external:
            # Linked (not embedded) images have no part to read bytes from.
            continue
        try:
            # Keep only the last path component so the output always lands
            # directly inside target_path.
            img_name = doc_name + rel.target_ref.split('/')[-1]
            with open(f'{target_path}/{img_name}', "wb") as f:
                f.write(rel.target_part.blob)
        except Exception as e:
            print(f"图片提取失败:{word_path} {rel.target_ref} {e!r}")
def convert_resume(doc_file):
    """
    Render one source resume (.docx) into the Templates\\resume.docx template.

    Field values are read from fixed cells of the first table of the source
    document, the project list from table rows 12 onwards, and the pictures
    from the images embedded in the document.  The result is saved in the
    Output folder under the same file name as the source.

    Args:
        doc_file: path of the source .docx file.

    Errors raised while rendering or saving are printed and the user is
    asked to press Enter; errors raised while reading the source table
    (for example a missing table or an unparsable birth date) propagate
    to the caller.
    """
    # Template file
    tpl = DocxTemplate(home + r'\Templates\resume.docx')
    document = Document(doc_file)
    table = document.tables[0]

    # The output keeps the source file name.
    output_file = home + '\\Output\\' + os.path.basename(doc_file)

    # The birth date is cut out of the ID number text: characters 6-9 are
    # used as the year, 10-11 as the month and 12-13 as the day.
    idno = table.cell(1, 1).text
    birthdate = f'{idno[6:10]}.{idno[10:12]}.{idno[12:14]}'
    age = calculate_age(birthdate)

    # Template context
    context = {
        'name': table.cell(0, 1).text,
        'birthdate': birthdate,
        'age': age,
        'idno': idno,
        'nationality': table.cell(1, 4).text,
        'school': table.cell(2, 1).text,
        'major': table.cell(3, 1).text,
        'pol_status': table.cell(3, 4).text,
        'education_background': table.cell(4, 1).text,
        'degree': table.cell(4, 4).text,
        'years': table.cell(5, 1).text,
        'job_title': table.row_cells(12)[3].text,
        'company': '北京南天软件有限公司',
        'project_title': table.cell(5, 7).text,
        'career': table.cell(6, 1).text,
        'education': table.cell(8, 1).text,
        # Empty cells are rendered as '无'.
        'training': table.cell(9, 1).text or '无',
        'professional_certificate': table.cell(10, 1).text or '无',
    }

    projects = []
    for row in range(12, len(table.rows)):
        cells = table.row_cells(row)  # looked up once per row
        projects.append({
            'period': cells[0].text,
            'name': cells[1].text,
            'role': cells[3].text,
            'remark': cells[6].text,
        })
    context['projects'] = projects

    # Temporary folder for the extracted images
    image_tmp = home + '\\SourceFiles\\image_tmp\\'
    # A folder left over from an interrupted run would mix another
    # document's pictures into this one.
    if os.path.isdir(image_tmp):
        shutil.rmtree(image_tmp)
    try:
        get_pictures(doc_file, image_tmp)
        images = []
        # os.listdir gives no ordering guarantee; sort so that "the first
        # image" is chosen deterministically.
        for index, image in enumerate(sorted(os.listdir(image_tmp))):
            image_path = f'{image_tmp}{image}'
            if index == 0:
                # The first image is taken as the ID photo; if the document
                # has none, a blank placeholder picture must be supplied.
                context['profile_photo'] = InlineImage(tpl, image_path, width=Inches(1))
            else:
                images.append({'image': InlineImage(tpl, image_path, width=Inches(5))})
        context['images'] = images

        try:
            tpl.render(context)
            tpl.save(output_file)
        except Exception as e:
            print(repr(e))
            input('按回车键继续...')
        else:
            # Report success only when the output was actually written.
            print(doc_file, "转换完成。")
    finally:
        # Remove the temporary image folder even when an error propagates.
        if os.path.isdir(image_tmp):
            shutil.rmtree(image_tmp)
def main():
    """
    Convert every resume under SourceFiles and merge the results.

    Steps:
      1. Handle ``-h``/``--help`` and ask the user to close Word/WPS.
      2. Recreate the Output folder (previous results are deleted).
      3. Walk SourceFiles: ``.doc`` files are first converted to a temporary
         ``.docx`` next to the source, ``.docx`` files are used directly.
      4. Append every file in Output to a new document saved as Result.docx.
    """
    # Called for its side effect: argparse prints the help text and exits
    # when -h/--help is given.
    parse_args()
    print('请先保存wps,word应用程序的工作并关闭应用')
    input('按回车键继续...')

    # Start from an empty output folder
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Walk the source folder and convert every document found
    folder = home + '\\SourceFiles'
    for root, dirs, files in os.walk(folder):
        for file in files:
            # '~$name.docx' files are Word's lock files, not documents.
            if file.startswith('~$'):
                continue
            filename = os.path.join(root, file)
            # File names are case-insensitive on Windows.
            lowered = filename.lower()
            if lowered.endswith('.doc'):
                new_filename = filename + 'x'
                # Never overwrite (and afterwards delete) a source .docx of
                # the same name; that file is converted on its own.
                if os.path.exists(new_filename):
                    print(f"跳过{filename}:同名docx文件已存在")
                    continue
                try:
                    conv_format(filename, new_filename)
                    sleep(1)
                    print(f"转换文档{new_filename}")
                    convert_resume(new_filename)
                finally:
                    # The converted copy is temporary; drop it even when the
                    # conversion fails.
                    if os.path.exists(new_filename):
                        os.remove(new_filename)
            elif lowered.endswith('.docx'):
                print(f"转换文档{filename}")
                convert_resume(filename)

    # Merge all outputs into Result.docx
    document_main = Document()
    cp = Composer(document_main)
    # Sorted so the merge order does not depend on directory listing order.
    for file in sorted(os.listdir(output_dir)):
        cp.append(Document(f'{output_dir}{file}'))
    document_main.save(f'{output_dir}Result.docx')
    print(f'全部完成。输出文件:{output_dir},合并文件:Result.docx')
    input('按回车键退出...')
if __name__ == "__main__":
    # Script entry point: drop the first module search path entry for this
    # process only, then run the conversion.
    sys.path = sys.path[1:]  # avoid the impact of relative path env, only affect this process
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化