master

分支 (1)

管理

管理

master

tencent-ocr-analysis
/
pdf2Json_2.py

#负责将pdf转换成json格式
#需要python3.8,安装包如下：
#pip install  tencentcloud-sdk-python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 31 19:41:27 2022

@author: jerry
"""
import json
import os
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
import time
import base64
import math

# Anchor coordinates obtained from the OCR result of c1.pdf (an aerospace
# metrology certificate). 'sn' is the [X, Y] anchor used to locate the serial
# number; element_array holds the nine [X, Y] anchors of the temperature
# readings, one per measurement value.
points_array = {
    'sn': [603, 875],
}
element_array = [
    [663, 1105],
    [663, 1136],
    [661, 1165],
    [661, 1193],
    [661, 1225],
    [660, 1254],
    [659, 1286],
    [655, 1315],
    [655, 1346],
]

def image_to_base64(file_path):
    """
    Read a file in binary mode and return its content Base64-encoded.

    :param file_path: path of the file to encode (the script passes PDF files)
    :return: the Base64 encoding of the file's bytes, as a ``str``
    """
    with open(file_path, "rb") as source:
        raw_bytes = source.read()
    # b64encode yields bytes; decode them so the result can go into a JSON payload.
    return base64.b64encode(raw_bytes).decode('UTF-8')

def find_pdf_files(directory):
    """
    Walk a directory tree and collect the absolute paths of all PDF files.

    The extension check is case-insensitive, so ``.pdf`` and ``.PDF`` both match.

    :param directory: path of the directory to walk (absolute or relative)
    :return: list of absolute paths of every PDF file found; empty when the
             directory does not exist or contains no PDF files
    """
    print(directory)
    pdf_files = []  # absolute paths of the PDF files found so far
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                # os.path.join(root, file) is only absolute when `directory`
                # is; abspath makes the documented contract hold either way.
                pdf_files.append(os.path.abspath(os.path.join(root, file)))
    return pdf_files

def _load_credential():
    """Build the Tencent Cloud credential from environment variables."""
    # Secrets must not live in source control. Keys can be obtained at
    # https://console.cloud.tencent.com/cam/capi and are expected in the
    # TENCENTCLOUD_SECRET_ID / TENCENTCLOUD_SECRET_KEY environment variables.
    secret_id = os.environ.get("TENCENTCLOUD_SECRET_ID")
    secret_key = os.environ.get("TENCENTCLOUD_SECRET_KEY")
    if not secret_id or not secret_key:
        raise RuntimeError(
            "Set TENCENTCLOUD_SECRET_ID and TENCENTCLOUD_SECRET_KEY "
            "before running the OCR conversion"
        )
    return credential.Credential(secret_id, secret_key)


def _json_output_path(filename):
    """Map a PDF path to the path of the JSON file that stores its OCR result."""
    # splitext swaps only the real extension, whatever its letter case, so a
    # "X.PDF" input can never map back onto the source file itself.
    base, _ext = os.path.splitext(filename)
    # Redirect ...\pdf\img_pdf\... to the sibling ...\json\... directory.
    return (base + ".json").replace(r'pdf\img_pdf', "json")


def ocr_to_jason(filename):
    """
    Run Tencent Cloud GeneralAccurateOCR on a PDF and save the response as JSON.

    Only page 3 of the PDF is requested. The raw JSON response is written to
    the path derived from ``filename`` (extension replaced by ``.json`` and
    ``pdf\\img_pdf`` replaced by ``json``); the output directory is created
    when missing.

    :param filename: path of the PDF file to recognise
    :return: the decoded JSON response of the last requested page, or ``None``
             when the SDK raised ``TencentCloudSDKException`` (the error is printed)
    :raises RuntimeError: when the credential environment variables are not set
    """
    cred = _load_credential()
    try:
        # Optional HTTP profile: pin the OCR endpoint.
        http_profile = HttpProfile()
        http_profile.endpoint = "ocr.tencentcloudapi.com"

        # Optional client profile carrying the HTTP profile.
        client_profile = ClientProfile()
        client_profile.httpProfile = http_profile
        client = ocr_client.OcrClient(cred, "ap-shanghai", client_profile)

        image_base64 = image_to_base64(filename)
        out_file = _json_output_path(filename)

        # The page count is not read from the PDF; only the third page is parsed.
        pdf_page_range = [3]
        # TODO: every page overwrites the same output file and only the last
        # page's result is returned; revisit if more pages are requested.
        # NOTE: an earlier commented-out draft passed the page JSON to
        # page_one/page_three and appended the values to a dated CSV file.
        pdf_conent_json = None
        for page_index in pdf_page_range:
            # One request object per call; IsPdf/PdfPageNumber select the page.
            req = models.GeneralAccurateOCRRequest()
            params = {
                "ImageBase64": image_base64,
                "IsPdf": True,
                "PdfPageNumber": page_index
            }
            req.from_json_string(json.dumps(params))

            # resp is the GeneralAccurateOCRResponse matching the request.
            resp = client.GeneralAccurateOCR(req)
            resp_text = resp.to_json_string()
            pdf_conent_json = json.loads(resp_text)

            print(filename)
            print(out_file)
            out_dir = os.path.dirname(out_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            # The context manager closes the file even if the write fails.
            with open(out_file, "w", encoding='utf-8') as fw:
                fw.write(resp_text)

        return pdf_conent_json
    except TencentCloudSDKException as err:
        print(err)
        return None

    #test begin
    # pdf_json=json.loads(json_str2)
    # print(pdf_json)
    #航天计量的证书解析格式
def page_one(pdf_json):
    """
    Extract the serial number from the OCR result of the certificate's first page.

    Scans ``pdf_json['TextDetections']`` in order and returns the text of the
    first detection whose ``ItemPolygon`` X/Y lies within range of the
    ``points_array['sn']`` anchor.

    :param pdf_json: decoded OCR response containing a 'TextDetections' list
    :return: the detected serial-number text, or the string 'null' if no
             detection is close enough to the anchor
    """
    for detection in pdf_json['TextDetections']:
        polygon = detection['ItemPolygon']
        position = [polygon['X'], polygon['Y']]
        if not point_is_in_range(position, points_array['sn']):
            continue
        # First hit wins; later detections are not inspected.
        return detection['DetectedText']
    return 'null'

def page_three(pdf_json):
    """
    Collect the measurement values from the OCR result of the third page.

    Detections are visited in order; each one is compared only against the
    next not-yet-matched anchor in ``element_array``, so values are matched
    in anchor order. Scanning stops once every anchor has been matched.

    :param pdf_json: decoded OCR response containing a 'TextDetections' list
    :return: list of detected texts, one per matched anchor (possibly shorter
             than ``element_array`` when some anchors are never matched)
    """
    readings = []
    anchor_total = len(element_array)
    for detection in pdf_json['TextDetections']:
        polygon = detection['ItemPolygon']
        position = [polygon['X'], polygon['Y']]
        # len(readings) is the index of the next anchor still to be matched.
        if point_is_in_range(position, element_array[len(readings)]):
            readings.append(detection['DetectedText'])
            if len(readings) >= anchor_total:
                break
    return readings

# Decide whether a point lies inside a circle around a reference point; used to
# check that a text coordinate sits at an expected position. The author reports
# that, in testing, the deviation between matching points stayed under 10.
def point_is_in_range(dat1, dat2, max_deviance=10):
    """
    Return True when two points are at most ``max_deviance`` apart.

    :param dat1: first point as an indexable ``[x, y]`` pair
    :param dat2: second point as an indexable ``[x, y]`` pair
    :param max_deviance: largest Euclidean distance still counted as in range
                         (inclusive); defaults to the original fixed value 10
    :return: ``True`` if the distance is <= ``max_deviance``, else ``False``
    """
    dx = dat1[0] - dat2[0]
    dy = dat1[1] - dat2[1]
    return math.sqrt(dx * dx + dy * dy) <= max_deviance


# Usage example. Mind how the directory is written: a raw string with single
# backslashes (the author notes that doubled backslashes did not work).
# These statements run whenever the module is executed or imported.
directory_path = r'C:\Users\awu\projects\tencent-ocr-analysis-master\pdf\img_pdf'
input_pdf_list = find_pdf_files(directory_path)
for input_pdf in input_pdf_list:
    # Each PDF is sent to OCR and its JSON result is written to disk.
    ocr_to_jason(input_pdf)