Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,61 @@
|
|
1 |
from pathlib import Path
|
2 |
import gradio as gr
|
3 |
import fitz
|
|
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
def upload_file(filepath):
|
6 |
name = Path(filepath).name
|
7 |
# load pdf
|
8 |
doc = fitz.open(filepath)
|
|
|
9 |
# now create the excel file
|
10 |
-
return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download
|
11 |
|
12 |
def download_file():
    """Return updates that show the upload button and hide the download button.

    Returns:
        A two-item list: a visible ``gr.UploadButton`` followed by a hidden
        ``gr.DownloadButton``.
    """
    upload_button = gr.UploadButton(visible=True)
    download_button = gr.DownloadButton(visible=False)
    return [upload_button, download_button]
|
|
|
1 |
from pathlib import Path
|
2 |
import gradio as gr
|
3 |
import fitz
|
4 |
+
import pandas as pd
|
5 |
|
6 |
+
# Column titles of the report table, in left-to-right order.
_HEADERS = [
    'TR [min]', 'Nome', 'Area', 'Fator Capacidade', 'Pratos Teóricos',
    'Sinal-ruído (USP)', 'Resolução', 'Assimetría', 'Altura', 'Pureza',
]
# Half-open [left, right) range of left x-coordinates accepted for each
# column. A word is only ever tested against the column currently being
# filled, so the overlapping ranges of the right-hand columns are resolved
# by the left-to-right fill order.
_LIMITS = [
    (17, 50), (50, 130), (130, 184), (184, 240), (240, 311),
    (311, 360), (330, 418), (400, 487), (450, 533), (500, 600),
]
# The 'Nome' column may span several words; every other column holds one.
_NAME_COLUMN = 1
# The header titles above contain 14 words in total; they are skipped as a
# unit once the header has been located.
_HEADER_WORD_COUNT = 14
# Word that ends the table on a page.
_FOOTER_WORD = 'Relatório'
_OUTPUT_PATH = "tabla.xlsx"


def _read_row(words, pos):
    """Read one table row starting at ``words[pos]``.

    Returns ``(row, new_pos)`` where ``row`` has one entry per column
    (``None`` for a column with no word) and ``new_pos`` is the index of the
    first word that was not consumed.
    """
    row = []
    for col, (left, right) in enumerate(_LIMITS):
        if pos >= len(words) or not (left <= words[pos][0] < right):
            # No word for this column (or no words left at all): pad the
            # cell so that every row has the same number of columns.
            row.append(None)
        elif col == _NAME_COLUMN:
            # Gather every consecutive word that starts inside the column.
            parts = []
            while pos < len(words) and left <= words[pos][0] < right:
                parts.append(words[pos][4])
                pos += 1
            row.append(" ".join(parts))
        else:
            row.append(words[pos][4])
            pos += 1
    return row, pos


def _read_page_rows(words):
    """Return the table rows found in the word list of a single page."""
    # Locate the first 'TR'; it only counts as the header when directly
    # followed by '[min]', otherwise the page is skipped.
    pos = 0
    while pos < len(words) and words[pos][4] != 'TR':
        pos += 1
    if pos + 1 < len(words) and words[pos + 1][4] != "[min]":
        return []
    pos += _HEADER_WORD_COUNT

    rows = []
    while pos < len(words) and words[pos][4] != _FOOTER_WORD:
        row, next_pos = _read_row(words, pos)
        if next_pos == pos:
            # The word starts outside every column range. Skip it instead
            # of emitting empty rows forever without making progress.
            pos += 1
            continue
        rows.append(row)
        pos = next_pos
    return rows


def create_excel(doc):
    """Extract the report table from a PDF and save it as ``tabla.xlsx``.

    Each page is read as a list of words, where item 0 of a word is its left
    x-coordinate and item 4 its text. On every page the words following the
    ``TR [min]`` header are assigned to columns by x-coordinate until the
    word ``Relatório`` or the end of the page is reached.

    Args:
        doc: Opened PDF document; it must be iterable over pages that
            provide ``get_text(option="words")``.

    Returns:
        The path of the written workbook, ``"tabla.xlsx"``, so that callers
        can hand the file to a download component.
    """
    rows = []
    for page in doc:
        rows.extend(_read_page_rows(page.get_text(option="words")))

    df_table = pd.DataFrame(rows, columns=_HEADERS)
    df_table.to_excel(_OUTPUT_PATH, index=False)
    return _OUTPUT_PATH
|
51 |
+
|
52 |
def upload_file(filepath):
    """Convert the uploaded PDF to an Excel workbook and offer it for download.

    Args:
        filepath: Path of the uploaded PDF file.

    Returns:
        A two-item list: a hidden ``gr.UploadButton`` followed by a visible
        ``gr.DownloadButton`` labelled "Download tabla.xlsx" whose value is
        the generated workbook.
    """
    # Open the PDF in a context manager so it is closed once the table has
    # been extracted, even if the extraction fails.
    with fitz.open(filepath) as doc:
        # create_excel writes its output to tabla.xlsx; fall back to that
        # path if it does not return one.
        excel_path = create_excel(doc) or "tabla.xlsx"
    return [
        gr.UploadButton(visible=False),
        gr.DownloadButton(label="Download tabla.xlsx", value=excel_path, visible=True),
    ]
|
59 |
|
60 |
def download_file():
    """Return updates that show the upload button and hide the download button.

    Returns:
        A two-item list: a visible ``gr.UploadButton`` followed by a hidden
        ``gr.DownloadButton``.
    """
    upload_button = gr.UploadButton(visible=True)
    download_button = gr.DownloadButton(visible=False)
    return [upload_button, download_button]
|