# File size: 6,382 Bytes
# 7ec53ba
# pip3 install "modelscope==1.7.2rc0" -f
# pip install pdfplumber
import glob
import pdfplumber
import re
import os
def check_lines(page, top, buttom):
    """Join the words of a page region into newline-separated text.

    Words come from ``page.extract_words()`` and are visited in the order
    returned. A word is kept when its ``'top'`` coordinate falls inside the
    requested band:

    * ``top == ''`` and ``buttom == ''``: every word is kept;
    * ``top == ''`` only: words with ``word['top'] > buttom``;
    * otherwise: words with ``buttom < word['top'] < top``.

    A kept word is glued to the text without a separator when it sits on the
    same visual line as the previous word (``'top'`` differs by at most 2),
    or when the previous word ended past 85% of the page width and the text
    so far does not end with a sentence terminator (the line is treated as
    wrapped). In every other case the word starts a new line, so the result
    usually begins with ``'\\n'``.

    Args:
        page: Object exposing ``extract_words()`` (a sequence of dicts with
            ``'text'``, ``'top'`` and ``'x1'`` keys) and a numeric ``width``.
        top: Exclusive upper limit for ``word['top']``, or ``''`` for none.
        buttom: Exclusive lower limit for ``word['top']``, or ``''`` for none
            (only valid as ``''`` when ``top`` is ``''`` as well).

    Returns:
        The assembled text; ``''`` when no word is kept.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    line_end = re.compile(r'(?:。|;|\d|报告全文)$')

    text = ''
    last_top = 0
    last_check = 0
    for word in page.extract_words():
        word_top = word['top']

        if top == '' and buttom == '':
            in_band = True
        elif top == '':
            in_band = word_top > buttom
        else:
            in_band = word_top < top and word_top > buttom

        if in_band:
            if abs(last_top - word_top) <= 2:
                # Same visual line as the previous word.
                text = text + word['text']
            elif last_check > 0 and not line_end.search(text):
                # Previous word reached the right margin and the sentence is
                # not finished: treat this word as a wrapped continuation.
                text = text + word['text']
            else:
                text = text + '\n' + word['text']

        # Tracked for every word, kept or not, so that the first kept word is
        # compared against the word that physically precedes it.
        last_top = word_top
        last_check = word['x1'] - page.width * 0.85
    return text
def _append_records(records, page, kind, contents):
    """Append one record per item of *contents*, numbered consecutively."""
    for content in contents:
        records.append({
            # NOTE(review): the page object itself is stored, so str(record)
            # embeds its repr; page.page_number may have been intended. Left
            # unchanged to keep the output format stable - confirm.
            'page': page,
            'allrow': len(records),
            'type': kind,
            'inside': content,
        })


def _merge_continuation_rows(table):
    """Fold rows whose first cell is None into the nearest row above, in place."""
    anchor = None
    for row in table:
        if row[0] is not None:
            anchor = row
            continue
        if anchor is None:
            # Continuation row with nothing above it. The previous code
            # indexed row -1 here, merging the text into the LAST row of the
            # table; keep it as a row of its own instead.
            row[0] = ''
            anchor = row
            continue
        for col, cell in enumerate(row):
            if cell is not None and cell != '' and cell != ' ':
                if anchor[col] is None:
                    anchor[col] = cell
                else:
                    anchor[col] = anchor[col] + cell
                row[col] = None


def _clean_rows(table):
    """Return rows that still have a first cell, None -> '' and newlines removed."""
    return [
        ['' if cell is None else cell.replace('\n', '') for cell in row]
        for row in table
        if row[0] is not None
    ]


def change_pdf_to_txt(name):
    """Extract text and tables from a PDF into two text files.

    Every page is split into text regions and tables (as located by
    ``page.find_tables()``). Text between tables is obtained from
    ``check_lines`` and emitted line by line as ``'text'`` records; each
    table row is emitted as an ``'excel'`` record holding ``str(row)``.

    Two files are appended to inside the module-level ``folder_path``
    directory, named after the input with ``.pdf`` replaced by ``.txt``
    (one ``str(record)`` per line) and by ``_txt.txt`` (only the record's
    ``'inside'`` value per line). Files are opened in append mode, so
    converting the same PDF twice duplicates its content.

    Args:
        name: Path of the PDF file to convert.
    """
    records = []
    # Context manager closes the PDF even if extraction raises.
    with pdfplumber.open(name) as pdf:
        for page in pdf.pages:
            tables = page.find_tables()
            if not tables:
                _append_records(
                    records, page, 'text',
                    check_lines(page, '', '').split('\n'))
                continue

            buttom = 0
            for table in tables:
                if table.bbox[3] < buttom:
                    # Table ends above the region already consumed; skip it.
                    continue
                # Text between the previous table (or page start) and this one.
                top = table.bbox[1]
                _append_records(
                    records, page, 'text',
                    check_lines(page, top, buttom).split('\n'))
                buttom = table.bbox[3]

                rows = table.extract()
                _merge_continuation_rows(rows)
                _append_records(
                    records, page, 'excel',
                    [str(row) for row in _clean_rows(rows)])

            # Text below the last processed table. Emitted after the loop so
            # it is not lost when some tables were skipped (the former
            # countdown never reached zero in that case).
            _append_records(
                records, page, 'text',
                check_lines(page, '', buttom).split('\n'))

        # Render while the PDF is still open, since records reference pages.
        detail_lines = [str(record) + '\n' for record in records]
        inside_lines = [str(record['inside']) + '\n' for record in records]

    if not records:
        # Nothing extracted: do not create empty output files.
        return

    # basename/join work with either path separator, unlike splitting on '\\'.
    base_name = os.path.basename(name)
    save_path_1 = os.path.join(folder_path, base_name.replace('.pdf', '.txt'))
    save_path_2 = os.path.join(
        folder_path, base_name.replace('.pdf', '_txt.txt'))
    # Open each output once instead of once per record.
    with open(save_path_1, 'a+', encoding='utf-8') as file:
        file.writelines(detail_lines)
    with open(save_path_2, 'a+', encoding='utf-8') as file:
        file.writelines(inside_lines)
# Folder scanned for input files; change_pdf_to_txt reads this module-level
# name to decide where its output files go.
folder_path = '新建文件夹'
# Collect every entry in the folder, in reverse-sorted order.
file_names = glob.glob(folder_path + '/*')
file_names = sorted(file_names, reverse=True)
print(file_names)
# Convert each file, printing its name first.
name_list = []
for file_name in file_names:
    print(file_name)
    try:
        name_list.append(file_name)
        # os.path.basename handles both '/' and '\\' separators, whereas
        # splitting on '\\' only works for Windows-style paths.
        allname = os.path.basename(file_name)
        fields = allname.split('__')
        # Indexing doubles as a format check: a name with fewer than five
        # '__'-separated fields raises IndexError and the file is skipped.
        date = fields[0]
        name = fields[1]
        year = fields[4]
        change_pdf_to_txt(file_name)
    except Exception as e:
        # Best-effort batch run: report the failure and go on to the next file.
        print(f"发生bug: {e}")
|