Spaces:

vermen
/

extracttable

Sleeping

File size: 2,881 Bytes

bfa8205
 
37e383b
57c12e6
bfa8205
b7f2820
57c12e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca3ccf5
57c12e6
b7f2820
57c12e6
bfa8205
 
b7f2820
37e383b
 
c4bc05d
37e383b
3e20130
bfa8205
 
 
 
 
def3cc8
bfa8205
3e20130
 
bfa8205

from pathlib import Path
import gradio as gr
import fitz
import pandas as pd

def create_excel(doc, name_excel):
    """Extract the peak table from every page of ``doc`` and save it as Excel.

    Each page is read as a list of PyMuPDF "words" tuples, where item 0 is
    the word's left x coordinate and item 4 is its text.  A page is parsed
    only when its first ``'TR'`` word is immediately followed by ``'[min]'``.
    The 14 words of the table header are skipped, and every following word is
    assigned to a column by comparing its x coordinate with ``LIMITS``.
    Parsing of a page ends at the last word or at a row that starts with
    ``'Relatório'``.

    Args:
        doc: An opened PyMuPDF document; any iterable of objects offering
            ``get_text(option="words")`` works as well.
        name_excel: Path of the Excel file to write.  Pass ``None`` to skip
            writing and only get the table back.

    Returns:
        The ``pandas.DataFrame`` holding one row per table line, with
        ``None`` in every column that had no word on that line.
    """
    HEADERS = ['TR [min]','Nome','Area','Fator Capacidade','Pratos Teóricos','Sinal-ruído (USP)','Resolução','Assimetría','Altura','Pureza']
    # Horizontal range [low, high) of each column, in the same order as
    # HEADERS.  Some ranges overlap; columns are tried left to right, so a
    # word goes to the first still-unfilled column whose range contains it.
    LIMITS = [(17,50),(50,130),(130,184),(184,240),(240,311),(311,360),(330,418),(400,487),(450,533),(500,600)]
    NAME_COLUMN = 1         # 'Nome' may span several words
    HEADER_WORD_COUNT = 14  # words to skip, counted from 'TR'
    STOP_WORD = 'Relatório'

    def within_limits(x, idx_limit):
        low, high = LIMITS[idx_limit]
        return low <= x < high

    data = {header: [] for header in HEADERS}

    for page in doc:
        words = page.get_text(option="words")

        # Locate the first 'TR'; the page holds a table only if '[min]'
        # comes right after it.
        start = next(
            (pos for pos, word in enumerate(words) if word[4] == 'TR'),
            None,
        )
        if start is None or start + 1 >= len(words):
            continue
        if words[start + 1][4] != "[min]":
            continue

        idx = start + HEADER_WORD_COUNT
        while idx < len(words):
            if words[idx][4] == STOP_WORD:
                break

            row_start = idx
            row = []
            for idx_col in range(len(HEADERS)):
                has_word = idx < len(words)
                if has_word and within_limits(words[idx][0], idx_col):
                    if idx_col == NAME_COLUMN:
                        # Gather every consecutive word lying in the name
                        # column into one space-separated value.
                        name_parts = []
                        while idx < len(words) and within_limits(words[idx][0], idx_col):
                            name_parts.append(words[idx][4])
                            idx += 1
                        row.append(" ".join(name_parts))
                    else:
                        row.append(words[idx][4])
                        idx += 1
                else:
                    # Also reached when the page runs out of words mid-row,
                    # which keeps every column the same length.
                    row.append(None)

            if idx == row_start:
                # The word fits no column; skip it instead of re-reading it
                # forever, and do not record the empty row.
                idx += 1
                continue

            for header, value in zip(HEADERS, row):
                data[header].append(value)

    df_table = pd.DataFrame.from_dict(data)
    if name_excel is not None:
        df_table.to_excel(name_excel, index=False)
    return df_table
    
def upload_file(filepath):
    """Convert the uploaded PDF to Excel and swap the visible button.

    The Excel file is named after the PDF (same stem, ``.xlsx`` suffix) and
    is written relative to the current working directory.

    Args:
        filepath: Path of the uploaded PDF.

    Returns:
        A two-item list: a hidden ``gr.UploadButton`` and a visible
        ``gr.DownloadButton`` whose value is the generated Excel file.
    """
    excel_name = Path(filepath).stem + ".xlsx"
    # The workbook must exist before the download button points at it; the
    # ``with`` block closes the PDF once the table has been extracted.
    with fitz.open(filepath) as doc:
        create_excel(doc, excel_name)
    return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Descarga {excel_name}", value=excel_name, visible=True)]

def download_file():
    """Return the button states used after a download click.

    Returns:
        A two-item list: a visible ``gr.UploadButton`` followed by a hidden
        ``gr.DownloadButton``.
    """
    upload_button = gr.UploadButton(visible=True)
    download_button = gr.DownloadButton(visible=False)
    return [upload_button, download_button]

# Build the UI: an instruction line and a row with the upload and download buttons.
with gr.Blocks() as demo:
    gr.Markdown("Primero sube tu archivo PDF, luego podrás descargar un archivo Excel (Sube un archivo por vez!)")
    with gr.Row():
        # Single-file upload button; the download button starts hidden.
        u = gr.UploadButton("Sube tu PDF", file_count="single")
        d = gr.DownloadButton("Descarga tu Excel", visible=False)

    # On upload, upload_file receives the upload button's value and returns
    # new states for both buttons.
    u.upload(upload_file, u, [u, d])
    # On download click, download_file takes no input and returns new states
    # for both buttons.
    d.click(download_file, None, [u, d])

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()