tradequant
/

code

Model card Files Files and versions Community

File size: 6,382 Bytes

7ec53ba

# pip3 install "modelscope==1.7.2rc0" -f
# pip install pdfplumber
import glob
import pdfplumber
import re
import os


def check_lines(page, top, buttom):
    lines = page.extract_words()[::]
    text = ''
    last_top = 0
    last_check = 0
    for each_line in lines:
        if top == '' and buttom == '':
            if abs(last_top - each_line['top']) <= 2:
                text = text + each_line['text']
            elif last_check > 0 and not re.search('(?:。|；|\d|报告全文)$', text):
                text = text + each_line['text']
            else:
                text = text + '\n' + each_line['text']
        elif top == '':
            if each_line['top'] > buttom:
                if abs(last_top - each_line['top']) <= 2:
                    text = text + each_line['text']
                elif last_check > 0 and not re.search('(?:。|；|\d|报告全文)$', text):
                    text = text + each_line['text']
                else:
                    text = text + '\n' + each_line['text']
        else:
            if each_line['top'] < top and each_line['top'] > buttom:
                if abs(last_top - each_line['top']) <= 2:
                    text = text + each_line['text']
                elif last_check > 0 and not re.search('(?:。|；|\d|报告全文)$', text):
                    text = text + each_line['text']
                else:
                    text = text + '\n' + each_line['text']
        last_top = each_line['top']
        last_check = each_line['x1'] - page.width * 0.85

    return text


def change_pdf_to_txt(name):
    pdf = pdfplumber.open(name)

    all_text = {}
    allrow = 0
    for i in range(len(pdf.pages)):
        page = pdf.pages[i]
        buttom = 0
        tables = page.find_tables()
        if len(tables) >= 1:
            count = len(tables)
            for table in tables:
                if table.bbox[3] < buttom:
                    pass
                else:
                    count = count - 1

                    top = table.bbox[1]
                    text = check_lines(page, top, buttom)
                    text_list = text.split('\n')
                    for _t in range(len(text_list)):
                        all_text[allrow] = {}
                        all_text[allrow]['page'] = page
                        all_text[allrow]['allrow'] = allrow
                        all_text[allrow]['type'] = 'text'
                        all_text[allrow]['inside'] = text_list[_t]
                        allrow = allrow + 1

                    buttom = table.bbox[3]
                    new_table = table.extract()
                    r_count = 0

                    for r in range(len(new_table)):
                        row = new_table[r]
                        if row[0] == None:
                            r_count = r_count + 1
                            for c in range(len(row)):
                                if row[c] != None and row[c] != '' and row[c] != ' ':
                                    if new_table[r - r_count][c] == None:
                                        new_table[r - r_count][c] = row[c]
                                    else:
                                        new_table[r - r_count][c] = new_table[r -
                                                                              r_count][c] + row[c]
                                    new_table[r][c] = None
                        else:
                            r_count = 0
                    end_table = []
                    for row in new_table:
                        if row[0] != None:
                            cell_list = []
                            for cell in row:
                                if cell != None:
                                    cell = cell.replace('\n', '')
                                else:
                                    cell = ''
                                cell_list.append(cell)
                            end_table.append(cell_list)
                    for row in end_table:
                        all_text[allrow] = {}
                        all_text[allrow]['page'] = page
                        all_text[allrow]['allrow'] = allrow
                        all_text[allrow]['type'] = 'excel'
                        all_text[allrow]['inside'] = str(row)
                        allrow = allrow + 1

                    if count == 0:
                        text = check_lines(page, '', buttom)
                        text_list = text.split('\n')
                        for _t in range(len(text_list)):
                            all_text[allrow] = {}
                            all_text[allrow]['page'] = page
                            all_text[allrow]['allrow'] = allrow
                            all_text[allrow]['type'] = 'text'
                            all_text[allrow]['inside'] = text_list[_t]
                            allrow = allrow + 1

        else:
            text = check_lines(page, '', '')
            text_list = text.split('\n')
            for _t in range(len(text_list)):
                all_text[allrow] = {}
                all_text[allrow]['page'] = page
                all_text[allrow]['allrow'] = allrow
                all_text[allrow]['type'] = 'text'
                all_text[allrow]['inside'] = text_list[_t]
                allrow = allrow + 1

    save_path_1 = f'{folder_path}\\' + \
        name.split('\\')[-1].replace('.pdf', '.txt')
    save_path_2 = f'{folder_path}\\' + \
        name.split('\\')[-1].replace('.pdf', '_txt.txt')
    for key in all_text.keys():
        with open(save_path_1, 'a+', encoding='utf-8') as file:
            file.write(str(all_text[key]) + '\n')
        with open(save_path_2, 'a+', encoding='utf-8') as file:
            file.write(str(all_text[key]['inside']) + '\n')


folder_path = '新建文件夹'
# 获取文件夹内所有文件名称
file_names = glob.glob(folder_path + '/*')
file_names = sorted(file_names, reverse=True)
print(file_names)
# 打印文件名称
name_list = []
for file_name in file_names:
    print(file_name)
    try:
        name_list.append(file_name)
        allname = file_name.split('\\')[-1]
        date = allname.split('__')[0]
        name = allname.split('__')[1]
        year = allname.split('__')[4]
        change_pdf_to_txt(file_name)
    except Exception as e:
        print(f"发生bug: {e}")