# -*- coding: utf-8 -*-
"""
Created by Shengbo.Zhang on 2021/08/13

Interactive batch driver (test example) for the PDF2TXT_V3 algorithm.

The script repeatedly prompts for a directory, calls ``get_txt_from_pdf`` on
every path yielded by ``find_all_local_file(directory, '.pdf')``, passes each
non-empty result to ``output_txt_string`` together with a ``<name>_P2T.txt``
path located beside the source PDF, and prints per-file and overall timing
statistics. Entering ``exit`` at the prompt terminates the program.
"""
import os
import sys
import time

##################################################
############## Algorithm: PDF2TXT_V3.py ##########
##############       Test example       ##########
##################################################
from Pdf2Txt.pdf2txt_v1 import find_all_local_file
from Pdf2Txt.pdf2txt_v3 import get_txt_from_pdf, output_txt_string

# Visual separator printed before the batch and after every processed file.
_SEPARATOR = '*****************************************************'


def _pdf_stem(pdf_path):
    """Return the file name of *pdf_path* without directory or extension."""
    return os.path.splitext(os.path.basename(pdf_path))[0]


def _output_txt_path(pdf_path):
    """Return the ``<stem>_P2T.txt`` path located beside *pdf_path*.

    ``os.path.join`` is used instead of gluing with ``//`` so that a PDF path
    without a directory component does not turn into a root-level path.
    """
    return os.path.join(os.path.dirname(pdf_path), f'{_pdf_stem(pdf_path)}_P2T.txt')


def _convert_one(pdf_path):
    """Convert a single PDF and hand the text to ``output_txt_string``.

    Returns:
        True when ``get_txt_from_pdf`` produced a non-empty string and the
        output call completed; False when the result was empty or any
        exception was raised (the exception is printed, not propagated, so
        one bad file does not abort the batch).
    """
    try:
        txt_string = get_txt_from_pdf(pdf_path)
        if txt_string != '':
            output_txt_string(txt_path=_output_txt_path(pdf_path), txt_string=txt_string)
            return True
    except Exception as e:
        print(e)
    return False


def _process_directory(test_file_dir):
    """Convert every PDF found under *test_file_dir* and print a summary."""
    count_total = 0
    count_success = 0

    print(_SEPARATOR)
    t1 = time.time()
    for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
        count_total += 1

        print(f'开始处理: 第 {idx + 1} 个文件...')
        print(f'文件名: {_pdf_stem(path)}.pdf')
        tt1 = time.time()
        if _convert_one(path):
            count_success += 1
            print('处理成功.')
        else:
            print('处理失败!')
        tt2 = time.time()
        print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')

        print(_SEPARATOR)

    t2 = time.time()
    count_failed = count_total - count_success
    # Guard the average: a directory without any PDF must not crash the
    # prompt loop with ZeroDivisionError.
    average = (t2 - t1) / count_total if count_total else 0.0

    print('\n所有PDF格式的公告文件已处理完毕!')
    print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
    print('执行耗时:', round(t2 - t1, 3), '秒')
    print('平均耗时:', round(average, 3), '秒/个')


# The prompt loop intentionally runs at module level, as in the original script.
while True:
    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
    if test_file_dir == 'exit':
        sys.exit()
    _process_directory(test_file_dir)


# --------------------------------------------------------------------------
# Legacy reference drivers (disabled). Statements are kept as originally
# written; only the layout was restored and pure comments were translated.
# --------------------------------------------------------------------------

# ##################################################
# ############## Algorithm: PDF2TXT_V2.py ##########
# ##############       Test example       ##########
# ##################################################
# from Pdf2Txt.pdf2txt_v1 import find_all_local_file
# from Pdf2Txt.pdf2txt_v2 import *
# while True:
#     count_total = 0
#     count_success = 0
#     count_failed = 0
#
#     test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
#     if test_file_dir == 'exit':
#         sys.exit()
#
#     print('*****************************************************')
#     t1 = time.time()
#     for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
#         count_total += 1
#
#         pdf_file_path = path
#         pdf_dir_path = os.path.dirname(path)
#         pdf_file_name = os.path.basename(path)[:-4]
#         output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
#
#         print(f'开始处理: 第 {idx + 1} 个文件...')
#         print(f'文件名: {pdf_file_name}.pdf')
#         tt1 = time.time()
#         try:
#             txt_string = get_txt_from_pdf(pdf_file_path)
#             if txt_string != '':
#                 output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
#                 count_success += 1
#                 print('处理成功.')
#             else:
#                 count_failed += 1
#                 print('处理失败!')
#         except Exception as e:
#             print(e)
#             count_failed += 1
#             print('处理失败!')
#         tt2 = time.time()
#         print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
#
#         print('*****************************************************')
#
#     t2 = time.time()
#     print('\n所有PDF格式的公告文件已处理完毕!')
#     print(f'文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}')
#     print('执行耗时:', round(t2-t1, 3), '秒')
#     print('平均耗时:', round((t2-t1)/count_total, 3), '秒/个')


# ##################################################
# ############## Algorithm: PDF2TXT_V1.py ##########
# ##############       Test example       ##########
# ##################################################
# from Pdf2Txt.pdf2txt_v1 import *
# while True:
#     count_total = 0
#     count_success = 0
#     count_failed = 0
#
#     test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径(输入exit退出): ')
#     if test_file_dir == 'exit':
#         sys.exit()
#     txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符 2. 不带段头段尾标识符(默认,按enter键) ')
#     if txt_output_mode == '1':
#         txt_output_mode = True
#     else:
#         txt_output_mode = False
#
#     print('*****************************************************')
#     for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
#         count_total += 1
#
#         pdf_file_path = path
#         pdf_dir_path = os.path.dirname(path)
#         pdf_file_name = os.path.basename(pdf_file_path)[:-4]
#         output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx"
#         output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
#         output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv"
#
#         t1 = time.time()
#         is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path)
#         t2 = time.time()
#         print(f'开始处理: 第 {idx + 1} 个文件...')
#         print(f'文件名: {pdf_file_name}.pdf')
#         print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验!')
#         print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms')
#
#         if not is_success:
#
#             count_failed += 1
#             print(f'文件: {pdf_file_path}')
#             print('错误: 原始pdf与生成的docx文件页数校验失败,拒绝进行下一步转换.')
#             # The check fails because pdf2docx cannot yet handle a small number of
#             # PDF files with special layouts; pending an update from its author.
#             # If the check fails, the announcement could later be discarded, or
#             # converted directly with the _get_txt_from_pdf() function.
#
#         else:
#
#             document = Document(output_docx_file_path)
#
#             is_success, txt_list = get_txt_from_docx(doc=document)
#             t3 = time.time()
#             print('步骤-2: 公告docx文件的段落提取与格式化已完成!')
#             print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms')
#
#             if not is_success:
#                 count_failed += 1
#                 print(f'文件: {pdf_file_path}')
#                 print('错误: 原始docx转换为txt文本的过程中出错,拒绝进行下一步转换.')
#             else:
#                 txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path,
#                                                             is_out_flag=False)
#                 t4 = time.time()
#                 print('步骤-3: 公告docx文件的表格提取与格式化已完成!')
#                 print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms')
#
#                 txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list)
#                 t5 = time.time()
#                 print('步骤-4: 公告txt文件的校对已完成!')
#                 print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms')
#
#                 write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode)
#                 str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode)
#                 t6 = time.time()
#                 print('步骤-5: 公告txt文件的输出已完成!')
#                 print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms')
#
#                 print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms')
#                 count_success += 1
#
#         if os.path.exists(output_docx_file_path):
#             os.remove(output_docx_file_path)
#         if os.path.exists(output_csv_file_path):
#             os.remove(output_csv_file_path)
#         print('*****************************************************')
#
#     print('\n所有PDF格式的公告文件已处理完毕!')
#     print(f'【文件总数:{count_total},处理成功:{count_success},处理失败:{count_failed}】')