Spaces:

vermen
/

extracttable

Sleeping

App Files Files Community

vermen committed on Aug 16, 2024

Commit

57c12e6

verified ·

1 Parent(s): 37e383b

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -1

app.py CHANGED Viewed

@@ -1,13 +1,61 @@
 from pathlib import Path
 import gradio as gr
 import fitz
 def upload_file(filepath):
     """Handle an uploaded PDF and switch the UI from upload to download mode.

     Args:
         filepath: Path of the uploaded file as handed over by the upload button.

     Returns:
         A two-element list: a hidden ``gr.UploadButton`` and a visible
         ``gr.DownloadButton`` whose label and value point at the uploaded file.

     Raises:
         Whatever ``fitz.open`` raises when the file cannot be opened.
     """
     name = Path(filepath).name
     # Open the PDF only to confirm it can be read, and close it right away:
     # nothing is extracted from it in this version, so keeping the handle
     # open would just leak it.
     with fitz.open(filepath):
         pass
     return [
         gr.UploadButton(visible=False),
         gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True),
     ]
 def download_file():
     """Return the button pair for the idle state: upload shown, download hidden."""
     upload_button = gr.UploadButton(visible=True)
     download_button = gr.DownloadButton(visible=False)
     return [upload_button, download_button]

 from pathlib import Path
 import gradio as gr
 import fitz
+import pandas as pd
# Column titles of the generated spreadsheet, in left-to-right order.
_HEADERS = [
    'TR [min]', 'Nome', 'Area', 'Fator Capacidade', 'Pratos Teóricos',
    'Sinal-ruído (USP)', 'Resolução', 'Assimetría', 'Altura', 'Pureza',
]
# One half-open [low, high) range per header. A word belongs to a column when
# its first coordinate (index 0 of the word tuple) falls inside that range.
# Some ranges overlap; columns are tried left to right, so the first match wins.
_LIMITS = [
    (17, 50), (50, 130), (130, 184), (184, 240), (240, 311),
    (311, 360), (330, 418), (400, 487), (450, 533), (500, 600),
]
# Index of the only column whose cell may span several consecutive words.
_NAME_COLUMN = 1
# Number of words occupied by the printed table header (14 for _HEADERS);
# that many words are skipped, counted from the 'TR' word, to reach the data.
_HEADER_WORD_COUNT = sum(len(title.split()) for title in _HEADERS)
# File written by create_excel, relative to the current working directory.
_OUTPUT_PATH = "tabla.xlsx"


def _within_limits(x, idx_limit):
    """Return True when *x* lies in the half-open range of column *idx_limit*."""
    low, high = _LIMITS[idx_limit]
    return low <= x < high


def _extract_table_rows(doc):
    """Collect the table cells of every page of *doc*, grouped by column.

    Each page must provide ``get_text(option="words")`` returning a sequence
    of word tuples; index 0 is used as the position compared against
    ``_LIMITS`` and index 4 as the word text (the PyMuPDF "words" layout).

    A page is processed only when its first 'TR' word is directly followed by
    '[min]'. Reading stops at the first row that starts with 'Relatório'.

    Returns:
        dict mapping every header to a list of cells (``str`` or ``None``);
        all lists have the same length.
    """
    columns = {title: [] for title in _HEADERS}
    for page in doc:
        words = page.get_text(option="words")
        total = len(words)

        # Locate the table header on this page.
        idx = 0
        while idx < total and words[idx][4] != 'TR':
            idx += 1
        if idx + 1 >= total or words[idx + 1][4] != "[min]":
            continue  # no (complete) table header on this page

        idx += _HEADER_WORD_COUNT
        while idx < total and words[idx][4] != 'Relatório':
            row_start = idx
            row = []
            for col in range(len(_HEADERS)):
                if idx >= total or not _within_limits(words[idx][0], col):
                    # Missing cell. Also pads the row when the words run out
                    # mid-row, so every column keeps the same length.
                    row.append(None)
                elif col == _NAME_COLUMN:
                    # The name may consist of several words: take them all.
                    parts = []
                    while idx < total and _within_limits(words[idx][0], col):
                        parts.append(words[idx][4])
                        idx += 1
                    row.append(" ".join(parts))
                else:
                    row.append(words[idx][4])
                    idx += 1
            if idx == row_start:
                # The word fits no column. Skip it; otherwise the loop would
                # never advance and would emit empty rows forever.
                idx += 1
                continue
            for title, cell in zip(_HEADERS, row):
                columns[title].append(cell)
    return columns


def create_excel(doc):
    """Extract the table from *doc* and save it as ``tabla.xlsx``.

    Args:
        doc: Iterable of pages (for example an opened PyMuPDF document) whose
            pages provide ``get_text(option="words")``.

    Returns:
        The path of the written workbook (``"tabla.xlsx"``, relative to the
        current working directory), suitable as the value of a download
        button.

    Raises:
        ImportError: if pandas has no Excel writer engine available.
    """
    df_table = pd.DataFrame.from_dict(_extract_table_rows(doc))
    df_table.to_excel(_OUTPUT_PATH, index=False)
    return _OUTPUT_PATH
 def upload_file(filepath):
     """Convert the uploaded PDF into a spreadsheet and offer it for download.

     Args:
         filepath: Path of the uploaded PDF as handed over by the upload button.

     Returns:
         A two-element list: a hidden ``gr.UploadButton`` and a visible
         ``gr.DownloadButton`` labelled "Download tabla.xlsx" whose value is
         whatever ``create_excel`` returned for the document.

     Raises:
         Whatever ``fitz.open`` or ``create_excel`` raise for an unreadable
         or unexpected document.
     """
     # The context manager closes the PDF handle as soon as the extraction is
     # done; the document is not needed once the spreadsheet has been written.
     with fitz.open(filepath) as doc:
         excel_file = create_excel(doc)
     return [
         gr.UploadButton(visible=False),
         gr.DownloadButton(label="Download tabla.xlsx", value=excel_file, visible=True),
     ]
 def download_file():
     """Reset the buttons after a download: show the upload button, hide the download button."""
     visible_upload = gr.UploadButton(visible=True)
     hidden_download = gr.DownloadButton(visible=False)
     return [visible_upload, hidden_download]