extracttable / app.py
vermen's picture
Update app.py
c4bc05d verified
from pathlib import Path
import gradio as gr
import fitz
import pandas as pd
# Column titles of the generated sheet, in left-to-right table order.
_HEADERS = [
    'TR [min]', 'Nome', 'Area', 'Fator Capacidade', 'Pratos Teóricos',
    'Sinal-ruído (USP)', 'Resolução', 'Assimetría', 'Altura', 'Pureza',
]

# Half-open [low, high) ranges, one per entry of _HEADERS, for the first
# coordinate of a word tuple.  A word is assigned to a column when that
# coordinate falls inside the column's range.  Some neighbouring ranges
# overlap; since columns are tried strictly left to right this still yields
# a single assignment per word.
_LIMITS = [
    (17, 50), (50, 130), (130, 184), (184, 240), (240, 311),
    (311, 360), (330, 418), (400, 487), (450, 533), (500, 600),
]

# Number of words skipped starting at 'TR' to land on the first data word.
# The titles in _HEADERS split into exactly 14 whitespace-separated words,
# so this presumably jumps over the table's header row.
_HEADER_WORD_COUNT = 14

# Index of the only column ('Nome') whose value may span several words.
_NAME_COLUMN = 1

# A row starting with this word marks the end of the table on a page.
_TABLE_END_WORD = 'Relatório'


def _within_limits(x, idx_limit):
    """Return True when *x* lies in the half-open range of column *idx_limit*."""
    low, high = _LIMITS[idx_limit]
    return low <= x < high


def _page_rows(words):
    """Return the table rows found in one page's word list.

    Args:
        words: sequence of word tuples as produced by
            ``page.get_text(option="words")``; element 0 is used as the
            horizontal position and element 4 as the word text.

    Returns:
        A list of rows.  Each row is a list with one entry per column of
        ``_HEADERS``; columns for which no word was found hold ``None``.
    """
    total = len(words)

    # Locate the first 'TR' word on the page.
    idx = 0
    while idx < total and words[idx][4] != 'TR':
        idx += 1
    # The page is ignored when that 'TR' is not followed by '[min]'.
    if idx + 1 < total and words[idx + 1][4] != '[min]':
        return []

    idx += _HEADER_WORD_COUNT
    rows = []
    while idx < total and words[idx][4] != _TABLE_END_WORD:
        row_start = idx
        # Pre-filling with None keeps every row the same width even when the
        # page runs out of words in the middle of a row.
        row = [None] * len(_HEADERS)
        for col in range(len(_HEADERS)):
            if idx >= total:
                break
            if not _within_limits(words[idx][0], col):
                continue
            if col == _NAME_COLUMN:
                # Join every consecutive word that sits in the name column.
                # The leading space is kept so the written cell contents are
                # unchanged from the previous implementation.
                name = ""
                while idx < total and _within_limits(words[idx][0], col):
                    name = name + " " + words[idx][4]
                    idx += 1
                row[col] = name
            else:
                row[col] = words[idx][4]
                idx += 1
        if idx == row_start:
            # The current word fits no column at all.  Skip it instead of
            # retrying it forever, and do not record the empty row.
            idx += 1
            continue
        rows.append(row)
    return rows


def create_excel(doc, name_excel):
    """Extract the 'TR [min]' table from every page of *doc* into an Excel file.

    Args:
        doc: iterable of page objects exposing ``get_text(option="words")``
            (for example an opened PyMuPDF document).
        name_excel: destination passed to ``DataFrame.to_excel``.

    The sheet has one column per entry of ``_HEADERS`` and one line per
    table row found; cells without a matching word are left empty.
    """
    data = {header: [] for header in _HEADERS}
    for page in doc:
        words = page.get_text(option="words")
        for row in _page_rows(words):
            for header, value in zip(_HEADERS, row):
                data[header].append(value)

    df_table = pd.DataFrame.from_dict(data)
    df_table.to_excel(name_excel, index=False)
def upload_file(filepath):
    """Convert the uploaded PDF to an Excel file and expose it for download.

    Args:
        filepath: path of the uploaded PDF as handed over by the upload button.

    Returns:
        A two-element list of component updates: the upload button hidden,
        and the download button made visible and pointing at the Excel file.
        The file is named after the PDF's stem with an ``.xlsx`` suffix and
        is written relative to the current working directory.
    """
    excel_name = Path(filepath).stem + ".xlsx"

    # The Excel file has to exist before the download button references it.
    doc = fitz.open(filepath)
    try:
        create_excel(doc, excel_name)
    finally:
        # Release the PDF handle even when the extraction fails.
        doc.close()

    return [
        gr.UploadButton(visible=False),
        gr.DownloadButton(label=f"Descarga {excel_name}", value=excel_name, visible=True),
    ]
def download_file():
    """Return component updates that show the upload button and hide the download button."""
    upload_button = gr.UploadButton(visible=True)
    download_button = gr.DownloadButton(visible=False)
    return [upload_button, download_button]
# User interface: a short instruction text plus one row holding the upload
# button and the (initially hidden) download button.
with gr.Blocks() as demo:
    gr.Markdown(
        value="Primero sube tu archivo PDF, luego podrás descargar un archivo Excel (Sube un archivo por vez!)"
    )
    with gr.Row():
        u = gr.UploadButton(label="Sube tu PDF", file_count="single")
        d = gr.DownloadButton(label="Descarga tu Excel", visible=False)

    # Both handlers return updates for the same pair of buttons, [u, d].
    u.upload(fn=upload_file, inputs=u, outputs=[u, d])
    d.click(fn=download_file, inputs=None, outputs=[u, d])

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()