# Spaces: Sleeping  (hosting-page status banner captured with the source; not program code)
from pathlib import Path

import fitz
import gradio as gr
import pandas as pd
# Column titles of the generated sheet, in left-to-right table order.
_HEADERS = ['TR [min]', 'Nome', 'Area', 'Fator Capacidade', 'Pratos Teóricos',
            'Sinal-ruído (USP)', 'Resolução', 'Assimetría', 'Altura', 'Pureza']
# Half-open ranges [low, high), one per entry of _HEADERS.  A word is assigned
# to a column when its first coordinate falls inside that column's range.
# The later ranges overlap, so columns must always be tried left to right.
_LIMITS = [(17, 50), (50, 130), (130, 184), (184, 240), (240, 311),
           (311, 360), (330, 418), (400, 487), (450, 533), (500, 600)]
# Index of the 'Nome' column, the only one whose value may span several words.
_NAME_COLUMN = 1
# Number of words skipped from 'TR' onward to jump over the header line
# (this equals the number of whitespace-separated words in _HEADERS).
_HEADER_WORD_COUNT = 14
# Word that marks the end of the table on a page.
_TABLE_END_WORD = 'Relatório'


def _within_limits(x, idx_limit):
    """Return True when x lies in the half-open range of column idx_limit."""
    low, high = _LIMITS[idx_limit]
    return low <= x < high


def _find_table_start(words):
    """Return the index of the first word after the table header.

    The header is recognised by the word 'TR' immediately followed by
    '[min]'.  When the page has no such pair, len(words) is returned so the
    caller's scan loop does not run.
    """
    for i in range(len(words) - 1):
        if words[i][4] == 'TR' and words[i + 1][4] == '[min]':
            return i + _HEADER_WORD_COUNT
    return len(words)


def _read_row(words, idx):
    """Read one table row starting at words[idx].

    Returns (row, next_idx): row always has one entry per column, with None
    for every column that has no word (including the case where the words
    run out in the middle of a row); next_idx is the first unconsumed word.
    """
    row = []
    for col in range(len(_HEADERS)):
        if idx >= len(words) or not _within_limits(words[idx][0], col):
            row.append(None)
        elif col == _NAME_COLUMN:
            # A name can be made of several words: take every consecutive
            # word that still falls inside the name column.
            parts = []
            while idx < len(words) and _within_limits(words[idx][0], col):
                parts.append(words[idx][4])
                idx += 1
            row.append(" ".join(parts))
        else:
            row.append(words[idx][4])
            idx += 1
    return row, idx


def create_excel(doc, name_excel):
    """Extract the peak table from every page of doc and save it as Excel.

    Args:
        doc: iterable of pages; each page must provide
            get_text(option="words") returning a sequence of word records in
            which item 0 is the horizontal position and item 4 the text
            (the layout used by PyMuPDF word tuples).
        name_excel: path of the .xlsx file to write.

    Each page is scanned from the word after the 'TR [min] ...' header until
    the word 'Relatório' or the end of the page.  Words are assigned to
    columns by horizontal position; columns without a word get None.

    Compared with the previous implementation:
      * a row cut short by the end of the page is padded with None instead of
        leaving columns of different lengths (which made pandas raise);
      * a word that fits no column is skipped instead of looping forever;
      * a stray 'TR' not followed by '[min]' no longer discards the page;
      * multi-word names are joined without a leading space.
    """
    data = {header: [] for header in _HEADERS}
    for page in doc:
        words = page.get_text(option="words")
        idx = _find_table_start(words)
        while idx < len(words) and words[idx][4] != _TABLE_END_WORD:
            row, next_idx = _read_row(words, idx)
            if next_idx == idx:
                # The current word matches no column: drop it so the scan
                # always makes progress, and do not record an empty row.
                idx += 1
                continue
            idx = next_idx
            for header, value in zip(_HEADERS, row):
                data[header].append(value)
    df_table = pd.DataFrame.from_dict(data)
    df_table.to_excel(name_excel, index=False)
def upload_file(filepath):
    """Convert the uploaded PDF to Excel and swap the buttons.

    Args:
        filepath: path of the uploaded PDF, as passed by the upload event.

    Returns:
        A two-item list for the [upload button, download button] outputs:
        the upload button hidden, and the download button visible and
        pointing at the generated Excel file.

    The Excel file is named after the PDF (same stem, ".xlsx" suffix) and is
    written relative to the current working directory.
    """
    excel_name = Path(filepath).stem + ".xlsx"
    # The download button below serves this file, so it must be written
    # before the button is returned.  The context manager closes the PDF
    # as soon as the table has been extracted.
    with fitz.open(filepath) as doc:
        create_excel(doc, excel_name)
    return [
        gr.UploadButton(visible=False),
        gr.DownloadButton(label=f"Descarga {excel_name}", value=excel_name, visible=True),
    ]
def download_file():
    """Restore the initial button state once the download was clicked.

    Returns:
        A two-item list for the [upload button, download button] outputs:
        the upload button visible again and the download button hidden.
    """
    upload_button = gr.UploadButton(visible=True)
    download_button = gr.DownloadButton(visible=False)
    return [upload_button, download_button]
# Page layout: an instruction line plus one row holding the upload button and
# the (initially hidden) download button.
with gr.Blocks() as demo:
    gr.Markdown(
        value="Primero sube tu archivo PDF, luego podrás descargar un archivo Excel (Sube un archivo por vez!)"
    )
    with gr.Row():
        u = gr.UploadButton(label="Sube tu PDF", file_count="single")
        d = gr.DownloadButton(label="Descarga tu Excel", visible=False)

    # Both handlers return new states for the same pair of buttons.
    u.upload(fn=upload_file, inputs=u, outputs=[u, d])
    d.click(fn=download_file, inputs=None, outputs=[u, d])


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()