# FrankWu
# /

# Application

# Model card Files Files and versions Community

# File size: 8,739 Bytes

# e2dccf7

# -*- coding: utf-8 -*-

'''
Created by Shengbo.Zhang on 2021/08/13
'''


import os
import sys
import time


##################################################
############## 算法：PDF2TXT_V3.py ################
##############       测试示例      ################
##################################################
from Pdf2Txt.pdf2txt_v1 import find_all_local_file
from Pdf2Txt.pdf2txt_v3 import *
# Interactive test driver for the PDF2TXT_V3 algorithm.
#
# Each pass of the outer loop asks for a directory, converts every '.pdf'
# file reported by find_all_local_file() to text with get_txt_from_pdf(),
# writes the result next to the source file as "<name>_P2T.txt", and prints
# per-file and overall timing statistics. Typing 'exit' terminates the script.
while True:
    # Per-run counters; reset every time a new directory is entered.
    count_total = 0
    count_success = 0
    count_failed = 0

    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
    if test_file_dir == 'exit':
        sys.exit()

    print('*****************************************************')
    t1 = time.time()
    # start=1 so the progress message can show a 1-based file number directly.
    for file_no, path in enumerate(find_all_local_file(test_file_dir, '.pdf'), start=1):
        count_total += 1

        pdf_file_path = path
        pdf_dir_path = os.path.dirname(path)
        # Drop the 4-character extension ('.pdf') from the base name.
        pdf_file_name = os.path.basename(path)[:-4]
        # os.path.join instead of a hand-built "dir//name": with an empty
        # dirname the hand-built form would point at the filesystem root.
        output_txt_file_path = os.path.join(pdf_dir_path, f"{pdf_file_name}_P2T.txt")

        print(f'开始处理: 第 {file_no} 个文件...')
        print(f'文件名: {pdf_file_name}.pdf')
        tt1 = time.time()
        try:
            txt_string = get_txt_from_pdf(pdf_file_path)
            if txt_string != '':
                output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
                count_success += 1
                print('处理成功.')
            else:
                # An empty result is counted as a failed conversion.
                count_failed += 1
                print('处理失败！')
        except Exception as e:
            # Best-effort batch run: report the error and continue with the
            # next file instead of aborting the whole directory.
            print(e)
            count_failed += 1
            print('处理失败！')
        tt2 = time.time()
        print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')

        print('*****************************************************')

    t2 = time.time()
    print('\n所有PDF格式的公告文件已处理完毕！')
    print(f'文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}')
    print('执行耗时：', round(t2-t1, 3), '秒')
    # Only report an average when at least one file was processed; dividing
    # by count_total unconditionally raised ZeroDivisionError for a directory
    # without any PDF files.
    if count_total:
        print('平均耗时：', round((t2-t1)/count_total, 3), '秒/个')



# ##################################################
# ############## 算法：PDF2TXT_V2.py ################
# ##############       测试示例      ################
# ##################################################
# from Pdf2Txt.pdf2txt_v1 import find_all_local_file
# from Pdf2Txt.pdf2txt_v2 import *
# while True:
#     count_total = 0
#     count_success = 0
#     count_failed = 0
#
#     test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
#     if test_file_dir == 'exit':
#         sys.exit()
#
#     print('*****************************************************')
#     t1 = time.time()
#     for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
#         count_total += 1
#
#         pdf_file_path = path
#         pdf_dir_path = os.path.dirname(path)
#         pdf_file_name = os.path.basename(path)[:-4]
#         output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
#
#         print(f'开始处理: 第 {idx + 1} 个文件...')
#         print(f'文件名: {pdf_file_name}.pdf')
#         tt1 = time.time()
#         try:
#             txt_string = get_txt_from_pdf(pdf_file_path)
#             if txt_string != '':
#                 output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
#                 count_success += 1
#                 print('处理成功.')
#             else:
#                 count_failed += 1
#                 print('处理失败！')
#         except Exception as e:
#             print(e)
#             count_failed += 1
#             print('处理失败！')
#         tt2 = time.time()
#         print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
#
#         print('*****************************************************')
#
#     t2 = time.time()
#     print('\n所有PDF格式的公告文件已处理完毕！')
#     print(f'文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}')
#     print('执行耗时：', round(t2-t1, 3), '秒')
#     print('平均耗时：', round((t2-t1)/count_total, 3), '秒/个')



# ##################################################
# ############## 算法：PDF2TXT_V1.py ################
# ##############       测试示例      ################
# ##################################################
# from Pdf2Txt.pdf2txt_v1 import *
# while True:
#     count_total = 0
#     count_success = 0
#     count_failed = 0
#
#     test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
#     if test_file_dir == 'exit':
#         sys.exit()
#     txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符  2. 不带段头段尾标识符（默认，按enter键） ')
#     if txt_output_mode == '1':
#         txt_output_mode = True
#     else:
#         txt_output_mode = False
#
#     print('*****************************************************')
#     for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
#         count_total += 1
#
#         pdf_file_path = path
#         pdf_dir_path = os.path.dirname(path)
#         pdf_file_name = os.path.basename(pdf_file_path)[:-4]
#         output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx"
#         output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
#         output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv"
#
#         t1 = time.time()
#         is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path)
#         t2 = time.time()
#         print(f'开始处理: 第 {idx + 1} 个文件...')
#         print(f'文件名: {pdf_file_name}.pdf')
#         print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验！')
#         print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms')
#
#         if not is_success:
#
#             count_failed += 1
#             print(f'文件: {pdf_file_path}')
#             print('错误: 原始pdf与生成的docx文件页数校验失败，拒绝进行下一步转换.')
#             # 校验失败的原因在于pdf2docx有暂无法处理少量包含特殊layout的pdf文件，待原作者更新；
#             # 若发生校验失败，后续可考虑直接丢弃该公告数据，或使用_get_txt_from_pdf()函数作直接转换。
#
#         else:
#
#             document = Document(output_docx_file_path)
#
#             is_success, txt_list = get_txt_from_docx(doc=document)
#             t3 = time.time()
#             print('步骤-2: 公告docx文件的段落提取与格式化已完成！')
#             print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms')
#
#             if not is_success:
#                 count_failed += 1
#                 print(f'文件: {pdf_file_path}')
#                 print('错误: 原始docx转换为txt文本的过程中出错，拒绝进行下一步转换.')
#             else:
#                 txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path,
#                                                             is_out_flag=False)
#                 t4 = time.time()
#                 print('步骤-3: 公告docx文件的表格提取与格式化已完成！')
#                 print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms')
#
#                 txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list)
#                 t5 = time.time()
#                 print('步骤-4: 公告txt文件的校对已完成！')
#                 print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms')
#
#                 write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode)
#                 str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode)
#                 t6 = time.time()
#                 print('步骤-5: 公告txt文件的输出已完成！')
#                 print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms')
#
#                 print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms')
#                 count_success += 1
#
#         if os.path.exists(output_docx_file_path):
#             os.remove(output_docx_file_path)
#         if os.path.exists(output_csv_file_path):
#             os.remove(output_csv_file_path)
#         print('*****************************************************')
#
#     print('\n所有PDF格式的公告文件已处理完毕！')
#     print(f'【文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}】')