FrankWu
/

Application

Model card Files Files and versions Community

Application / pdf2txt_test.py

FrankWu

Upload 5 files

e2dccf7 verified over 1 year ago

raw

history blame contribute delete

8.74 kB

	# -*- coding: utf-8 -*-

	'''
	Created by Shengbo.Zhang on 2021/08/13
	'''


	import sys
	import time


	##################################################
	############## 算法：PDF2TXT_V3.py ################
	############## 测试示例 ################
	##################################################
	from Pdf2Txt.pdf2txt_v1 import find_all_local_file
	from Pdf2Txt.pdf2txt_v3 import *
	import os  # Used for path handling below; imported explicitly instead of relying on a wildcard import to supply it.

	# Interactive test driver for the V3 converter.
	# Each pass of the outer loop asks for a directory, runs get_txt_from_pdf() on
	# every path returned by find_all_local_file(directory, '.pdf'), writes each
	# non-empty result next to its source file as "<name>_P2T.txt", and prints
	# per-file and overall timing. Typing "exit" at the prompt ends the program.
	while True:
	    # Counters are reset for every directory the user enters.
	    count_total = 0
	    count_success = 0
	    count_failed = 0

	    test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
	    if test_file_dir == 'exit':
	        sys.exit()

	    print('*****************************************************')
	    # perf_counter() is monotonic, so the measured durations cannot go
	    # negative or jump if the system clock is adjusted mid-run.
	    batch_start = time.perf_counter()
	    for file_no, pdf_file_path in enumerate(find_all_local_file(test_file_dir, '.pdf'), start=1):
	        count_total += 1

	        pdf_dir_path = os.path.dirname(pdf_file_path)
	        # splitext removes whatever extension is present rather than
	        # blindly cutting the last four characters of the file name.
	        pdf_file_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
	        # os.path.join replaces the hard-coded '//' separator, which produced
	        # a root-anchored path whenever the directory part was empty.
	        output_txt_file_path = os.path.join(pdf_dir_path, f'{pdf_file_name}_P2T.txt')

	        print(f'开始处理: 第 {file_no} 个文件...')
	        print(f'文件名: {pdf_file_name}.pdf')
	        file_start = time.perf_counter()
	        try:
	            txt_string = get_txt_from_pdf(pdf_file_path)
	            if txt_string != '':
	                output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
	                count_success += 1
	                print('处理成功.')
	            else:
	                # An empty result is counted as a failed conversion.
	                count_failed += 1
	                print('处理失败！')
	        except Exception as e:
	            # Deliberately broad: one bad file must not abort the whole batch.
	            print(e)
	            count_failed += 1
	            print('处理失败！')
	        file_end = time.perf_counter()
	        print('--> 执行耗时:', int((file_end - file_start) * 1000.0), '毫秒')

	        print('*****************************************************')

	    batch_elapsed = time.perf_counter() - batch_start
	    print('\n所有PDF格式的公告文件已处理完毕！')
	    print(f'文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}')
	    print('执行耗时：', round(batch_elapsed, 3), '秒')
	    # The per-file average is undefined when no PDF was found; skipping the
	    # line avoids the ZeroDivisionError that would otherwise end the session.
	    if count_total:
	        print('平均耗时：', round(batch_elapsed / count_total, 3), '秒/个')



	# ##################################################
	# ############## 算法：PDF2TXT_V2.py ################
	# ############## 测试示例 ################
	# ##################################################
	# from Pdf2Txt.pdf2txt_v1 import find_all_local_file
	# from Pdf2Txt.pdf2txt_v2 import *
	# while True:
	# count_total = 0
	# count_success = 0
	# count_failed = 0
	#
	# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
	# if test_file_dir == 'exit':
	# sys.exit()
	#
	# print('*****************************************************')
	# t1 = time.time()
	# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
	# count_total += 1
	#
	# pdf_file_path = path
	# pdf_dir_path = os.path.dirname(path)
	# pdf_file_name = os.path.basename(path)[:-4]
	# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
	#
	# print(f'开始处理: 第 {idx + 1} 个文件...')
	# print(f'文件名: {pdf_file_name}.pdf')
	# tt1 = time.time()
	# try:
	# txt_string = get_txt_from_pdf(pdf_file_path)
	# if txt_string != '':
	# output_txt_string(txt_path=output_txt_file_path, txt_string=txt_string)
	# count_success += 1
	# print('处理成功.')
	# else:
	# count_failed += 1
	# print('处理失败！')
	# except Exception as e:
	# print(e)
	# count_failed += 1
	# print('处理失败！')
	# tt2 = time.time()
	# print('--> 执行耗时:', int((tt2 - tt1) * 1000.0), '毫秒')
	#
	# print('*****************************************************')
	#
	# t2 = time.time()
	# print('\n所有PDF格式的公告文件已处理完毕！')
	# print(f'文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}')
	# print('执行耗时：', round(t2-t1, 3), '秒')
	# print('平均耗时：', round((t2-t1)/count_total, 3), '秒/个')



	# ##################################################
	# ############## 算法：PDF2TXT_V1.py ################
	# ############## 测试示例 ################
	# ##################################################
	# from Pdf2Txt.pdf2txt_v1 import *
	# while True:
	# count_total = 0
	# count_success = 0
	# count_failed = 0
	#
	# test_file_dir = input('\n请输入公告PDF文件所在目录的完整路径（输入exit退出）: ')
	# if test_file_dir == 'exit':
	# sys.exit()
	# txt_output_mode = input('\n请选择TXT输出模式: 1. 带段头段尾表标识符 2. 不带段头段尾标识符（默认，按enter键） ')
	# if txt_output_mode == '1':
	# txt_output_mode = True
	# else:
	# txt_output_mode = False
	#
	# print('*****************************************************')
	# for idx, path in enumerate(find_all_local_file(test_file_dir, '.pdf')):
	# count_total += 1
	#
	# pdf_file_path = path
	# pdf_dir_path = os.path.dirname(path)
	# pdf_file_name = os.path.basename(pdf_file_path)[:-4]
	# output_docx_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.docx"
	# output_txt_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.txt"
	# output_csv_file_path = f"{pdf_dir_path}//{pdf_file_name}_P2T.csv"
	#
	# t1 = time.time()
	# is_success = get_docx_from_pdf(pdf_path=pdf_file_path, out_path=output_docx_file_path)
	# t2 = time.time()
	# print(f'开始处理: 第 {idx + 1} 个文件...')
	# print(f'文件名: {pdf_file_name}.pdf')
	# print('步骤-1: 公告pdf文件已转换为docx格式并进行页数校验！')
	# print('--> 执行耗时:', int((t2 - t1) * 1000.0), 'ms')
	#
	# if not is_success:
	#
	# count_failed += 1
	# print(f'文件: {pdf_file_path}')
	# print('错误: 原始pdf与生成的docx文件页数校验失败，拒绝进行下一步转换.')
	# # 校验失败的原因在于pdf2docx有暂无法处理少量包含特殊layout的pdf文件，待原作者更新；
	# # 若发生校验失败，后续可考虑直接丢弃该公告数据，或使用_get_txt_from_pdf()函数作直接转换。
	#
	# else:
	#
	# document = Document(output_docx_file_path)
	#
	# is_success, txt_list = get_txt_from_docx(doc=document)
	# t3 = time.time()
	# print('步骤-2: 公告docx文件的段落提取与格式化已完成！')
	# print('--> 执行耗时:', int((t3 - t2) * 1000.0), 'ms')
	#
	# if not is_success:
	# count_failed += 1
	# print(f'文件: {pdf_file_path}')
	# print('错误: 原始docx转换为txt文本的过程中出错，拒绝进行下一步转换.')
	# else:
	# txt_list, attach_list = get_table_from_docx(doc=document, txt=txt_list, out_path=output_csv_file_path,
	# is_out_flag=False)
	# t4 = time.time()
	# print('步骤-3: 公告docx文件的表格提取与格式化已完成！')
	# print('--> 执行耗时:', int((t4 - t3) * 1000.0), 'ms')
	#
	# txt_list = refine_pdf2txt_list_result(txt=txt_list, att_txt=attach_list)
	# t5 = time.time()
	# print('步骤-4: 公告txt文件的校对已完成！')
	# print('--> 执行耗时:', int((t5 - t4) * 1000.0), 'ms')
	#
	# write_pdf2txt_list_result(out_path=output_txt_file_path, txt=txt_list, out_mode_flag=txt_output_mode)
	# str_result = get_pdf2txt_str_result(txt=txt_list, out_mode_flag=txt_output_mode)
	# t6 = time.time()
	# print('步骤-5: 公告txt文件的输出已完成！')
	# print('--> 执行耗时:', int((t6 - t5) * 1000.0), 'ms')
	#
	# print('----> 总运行时间:', int((t6 - t1) * 1000.0), 'ms')
	# count_success += 1
	#
	# if os.path.exists(output_docx_file_path):
	# os.remove(output_docx_file_path)
	# if os.path.exists(output_csv_file_path):
	# os.remove(output_csv_file_path)
	# print('*****************************************************')
	#
	# print('\n所有PDF格式的公告文件已处理完毕！')
	# print(f'【文件总数：{count_total}，处理成功：{count_success}，处理失败：{count_failed}】')