Spaces:
Build error
Build error
neerajkalyank
committed on
Commit
•
ec5938f
1
Parent(s):
0b7731a
Update app.py
Browse files
app.py
CHANGED
@@ -1,71 +1,23 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
-
import
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
df = pd.DataFrame(table[1:], columns=table[0])
|
22 |
-
logging.info(f"Processing page {page_num + 1} with columns: {df.columns.tolist()}")
|
23 |
-
|
24 |
-
# Check if required columns are present
|
25 |
-
matching_columns = [col for col in required_columns if col in df.columns]
|
26 |
-
if len(matching_columns) == len(required_columns):
|
27 |
-
filtered_df = df[required_columns]
|
28 |
-
pdf_data.append(filtered_df)
|
29 |
-
extracted_tables = True
|
30 |
-
|
31 |
-
if not extracted_tables:
|
32 |
-
logging.error("No tables with specified columns found in the PDF.")
|
33 |
-
return "Error: No tables with specified columns found in the PDF. Please verify the column names."
|
34 |
-
|
35 |
-
# Combine all filtered tables into one DataFrame
|
36 |
-
full_data = pd.concat(pdf_data, ignore_index=True)
|
37 |
-
|
38 |
-
# Save to Excel
|
39 |
-
output = io.BytesIO()
|
40 |
-
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
|
41 |
-
full_data.to_excel(writer, index=False)
|
42 |
-
|
43 |
-
output.seek(0)
|
44 |
-
logging.info("Successfully extracted the specified columns and saved to Excel.")
|
45 |
-
return output
|
46 |
-
|
47 |
-
except KeyError as e:
|
48 |
-
logging.error(f"Column not found: {e}")
|
49 |
-
return f"Error: Column '{e.args[0]}' not found in the extracted tables."
|
50 |
-
except Exception as e:
|
51 |
-
logging.error(f"Unexpected Error: {e}")
|
52 |
-
return f"Unexpected Error: {str(e)}"
|
53 |
-
|
54 |
-
def main():
|
55 |
-
# Define Gradio interface
|
56 |
-
column_input = gr.Textbox(label="Enter Column Names (comma-separated)")
|
57 |
-
file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
|
58 |
-
output_file = gr.File(label="Download Extracted Excel File")
|
59 |
-
|
60 |
-
# Create Gradio interface
|
61 |
-
interface = gr.Interface(
|
62 |
-
fn=extract_columns_from_pdf,
|
63 |
-
inputs=[file_input, column_input],
|
64 |
-
outputs="file", # Specify "file" output type
|
65 |
-
)
|
66 |
-
|
67 |
-
# Launch app
|
68 |
-
interface.launch()
|
69 |
|
70 |
if __name__ == "__main__":
|
71 |
-
|
|
|
1 |
import gradio as gr
|
2 |
+
from toshiba import extract_toshiba_data
|
3 |
+
from bhel import extract_bhel_data
|
4 |
+
|
5 |
+
def extract_data(pdf_file, company):
|
6 |
+
if company == 'Toshiba':
|
7 |
+
return extract_toshiba_data(pdf_file)
|
8 |
+
elif company == 'BHEL':
|
9 |
+
return extract_bhel_data(pdf_file)
|
10 |
+
else:
|
11 |
+
raise ValueError("Unsupported company format")
|
12 |
+
|
13 |
+
company_options = ['Toshiba', 'BHEL']
|
14 |
+
interface = gr.Interface(
|
15 |
+
fn=extract_data,
|
16 |
+
inputs=[gr.File(label="Upload PDF"), gr.Dropdown(choices=company_options, label="Select Company")],
|
17 |
+
outputs=gr.File(label="Download Extracted Data as Excel"),
|
18 |
+
title="PDF Data Extractor for Toshiba and BHEL",
|
19 |
+
description="Upload a PDF file and select the company to extract and format data into an Excel file according to specific requirements."
|
20 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
if __name__ == "__main__":
|
23 |
+
interface.launch()
|