vermen committed on
Commit
57c12e6
verified
1 Parent(s): 37e383b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -1
app.py CHANGED
@@ -1,13 +1,61 @@
1
  from pathlib import Path
2
  import gradio as gr
3
  import fitz
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def upload_file(filepath):
6
  name = Path(filepath).name
7
  # load pdf
8
  doc = fitz.open(filepath)
 
9
  # now create the excel file
10
- return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
11
 
12
  def download_file():
13
  return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
 
1
  from pathlib import Path
2
  import gradio as gr
3
  import fitz
4
+ import pandas as pd
5
 
6
def create_excel(doc):
    """Extract the results table from a PDF report and save it as ``tabla.xlsx``.

    Every page is read as a flat list of words. On each page the table is
    expected to start at the word ``TR`` followed by ``[min]``; the words after
    the header are assigned to columns by comparing their left x coordinate
    with ``LIMITS``. Reading of a page stops at the word ``Relatório`` or when
    the page runs out of words.

    Args:
        doc: An opened PyMuPDF document (any iterable of pages offering
            ``get_text(option="words")`` works).

    Returns:
        The path of the Excel file that was written (``"tabla.xlsx"``), so it
        can be handed directly to a download component.
    """
    HEADERS = ['TR [min]', 'Nome', 'Area', 'Fator Capacidade', 'Pratos Teóricos',
               'Sinal-ruído (USP)', 'Resolução', 'Assimetría', 'Altura', 'Pureza']
    # Horizontal range [low, high) of each column, in page coordinates.
    # LIMITS is used to identify to which column a word pertains. Ranges of the
    # last columns overlap; columns are tried left to right, one word each.
    LIMITS = [(17, 50), (50, 130), (130, 184), (184, 240), (240, 311),
              (311, 360), (330, 418), (400, 487), (450, 533), (500, 600)]
    NAME_COL = 1        # the only column whose cell may span several words
    HEADER_WORDS = 14   # words skipped from 'TR' on (presumably the header row)
    OUTPUT_PATH = "tabla.xlsx"

    def within_limits(x, idx_limit):
        low, high = LIMITS[idx_limit]
        return low <= x < high

    # Column name -> list of cell values, used to build the dataframe.
    data = {header: [] for header in HEADERS}

    for page in doc:
        # Each word is a tuple whose item 0 is the left x coordinate and whose
        # item 4 is the text.
        blocks = page.get_text(option="words")

        # Locate the table header on this page.
        idx = 0
        while idx < len(blocks) and blocks[idx][4] != 'TR':
            idx += 1
        # Skip the page when the first 'TR' is not followed by '[min]'.
        if idx + 1 < len(blocks) and blocks[idx + 1][4] != "[min]":
            continue
        idx += HEADER_WORDS

        while idx < len(blocks):
            if blocks[idx][4] == 'Relatório':
                break
            row_start = idx
            row = []
            # Always visit every column so each row has one cell per header;
            # otherwise a page ending mid-row leaves columns of different
            # lengths and the dataframe cannot be built.
            for idx_col in range(len(HEADERS)):
                if idx < len(blocks) and within_limits(blocks[idx][0], idx_col):
                    if idx_col == NAME_COL:
                        # Gather all consecutive words lying in the name column.
                        name_words = []
                        while idx < len(blocks) and within_limits(blocks[idx][0], idx_col):
                            name_words.append(blocks[idx][4])
                            idx += 1
                        # Leading space kept so the cell text matches what the
                        # previous word-by-word concatenation produced.
                        row.append(" " + " ".join(name_words))
                    else:
                        row.append(blocks[idx][4])
                        idx += 1
                else:
                    row.append(None)
            if idx == row_start:
                # The current word fits no column at all. Skip it instead of
                # emitting empty rows forever without advancing.
                idx += 1
                continue
            for header, cell in zip(HEADERS, row):
                data[header].append(cell)

    df_table = pd.DataFrame.from_dict(data)
    # to_excel() returns None, so return the path of the written file instead.
    df_table.to_excel(OUTPUT_PATH, index=False)
    return OUTPUT_PATH
51
+
52
  def upload_file(filepath):
53
  name = Path(filepath).name
54
  # load pdf
55
  doc = fitz.open(filepath)
56
+ df_table = create_excel(doc)
57
  # now create the excel file
58
+ return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download tabla.xlsx", value=df_table, visible=True)]
59
 
60
  def download_file():
61
  return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]