master

分支 (1)

管理

管理

master

tencent-ocr-analysis
/
parseFromPdf.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 31 19:41:27 2022

@author: jerry
"""
import json
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models

import base64
def image_to_base64(file_path):
    """
    Read a file (an image or a PDF) and return its contents as Base64 text.

    :param file_path: path of the file to encode; it is opened in binary mode
    :return: the Base64 encoding of the file's bytes, as a str
    """
    with open(file_path, "rb") as source:
        raw_bytes = source.read()
    # b64encode yields bytes; the callers need text, so decode it.
    return base64.b64encode(raw_bytes).decode('UTF-8')


import os

# Send one GeneralAccurateOCR request for a local PDF and store the JSON
# response in res.json (relative to the current working directory).
try:
    # Build the credential object from the Tencent Cloud SecretId / SecretKey.
    # The key pair is read from the environment instead of being embedded in
    # the source, so it never ends up in version control; a missing variable
    # raises KeyError naming it. Keys can be obtained at
    # https://console.cloud.tencent.com/cam/capi
    # NOTE(review): the key pair that used to be hard-coded here has been
    # committed to the repository and should be revoked and reissued.
    cred = credential.Credential(
        os.environ["TENCENTCLOUD_SECRET_ID"],
        os.environ["TENCENTCLOUD_SECRET_KEY"],
    )
    # Optional HTTP profile; only the endpoint is set here.
    httpProfile = HttpProfile()
    httpProfile.endpoint = "ocr.tencentcloudapi.com"

    # Optional client profile carrying the HTTP profile above.
    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    # Client for the OCR product; clientProfile is optional.
    client = ocr_client.OcrClient(cred, "ap-shanghai", clientProfile)

    # Despite the name, this holds the Base64 text of the PDF, not a URL.
    imageurl = image_to_base64("/home/jerry/python/ocr/c1.pdf")

    # Every API has a matching request object. For a PDF input the IsPdf
    # field is added, and PdfPageNumber selects the page to recognize.
    req = models.GeneralAccurateOCRRequest()
    params = {
        "ImageBase64": imageurl,
        "IsPdf": True,
        "PdfPageNumber": 2,
    }
    req.from_json_string(json.dumps(params))

    # resp is the GeneralAccurateOCRResponse matching the request object.
    resp = client.GeneralAccurateOCR(req)
    # Output the response as a JSON string, then persist it.
    json_str = resp.to_json_string()
    print(json_str)
    # The context manager closes the file even if the write fails; UTF-8 is
    # set explicitly because the recognized text may be non-ASCII and the
    # locale default encoding is not guaranteed to handle it.
    with open("res.json", "w", encoding="utf-8") as fw:
        fw.write(json_str)

except TencentCloudSDKException as err:
    print(err)