neerajkalyank committed on
Commit
ec5938f
1 Parent(s): 0b7731a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -68
app.py CHANGED
@@ -1,71 +1,23 @@
1
  import gradio as gr
2
- import pandas as pd
3
- import pdfplumber
4
- import io
5
- import logging
6
-
7
- # Set up logging for better debugging
8
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
9
-
10
def extract_columns_from_pdf(file, columns):
    """Extract the requested columns from the tables of a PDF into an Excel file.

    Every table on every page is loaded into a DataFrame using the table's
    first row as the header. Tables that contain *all* requested columns are
    kept (restricted to those columns) and concatenated in page order.

    Args:
        file: Whatever ``pdfplumber.open`` accepts for the uploaded PDF.
        columns: Comma-separated column names; surrounding whitespace and
            empty entries are ignored.

    Returns:
        On success, the path (``str``) of a temporary ``.xlsx`` file holding
        the combined data. On failure, a human-readable message starting
        with ``"Error:"`` or ``"Unexpected Error:"``.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    # Drop blank entries so "" or "a,,b" does not demand a column named "".
    required_columns = [
        name.strip() for name in (columns or "").split(",") if name.strip()
    ]
    if not required_columns:
        logging.error("No column names were provided.")
        return "Error: No column names were provided."

    try:
        pdf_data = []

        with pdfplumber.open(file) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                for table in page.extract_tables():
                    if not table:  # Skip empty extractions
                        continue

                    df = pd.DataFrame(table[1:], columns=table[0])
                    logging.info(
                        "Processing page %d with columns: %s",
                        page_num,
                        df.columns.tolist(),
                    )

                    # Keep the table only when every requested column exists.
                    if all(col in df.columns for col in required_columns):
                        pdf_data.append(df[required_columns])

        if not pdf_data:
            logging.error("No tables with specified columns found in the PDF.")
            return "Error: No tables with specified columns found in the PDF. Please verify the column names."

        # Combine all filtered tables into one DataFrame
        full_data = pd.concat(pdf_data, ignore_index=True)

        # Gradio's file output serves a file from disk, so write the workbook
        # to a named temporary file and hand back its path rather than an
        # in-memory buffer. delete=False: the file must outlive this call.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            output_path = tmp.name
        with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
            full_data.to_excel(writer, index=False)

        logging.info("Successfully extracted the specified columns and saved to Excel.")
        return output_path

    except KeyError as e:
        logging.error("Column not found: %s", e)
        return f"Error: Column '{e.args[0]}' not found in the extracted tables."
    except Exception as e:
        # Top-level boundary for the UI callback: report instead of crashing.
        logging.error("Unexpected Error: %s", e)
        return f"Unexpected Error: {str(e)}"
53
-
54
def main():
    """Build the Gradio UI around ``extract_columns_from_pdf`` and launch it."""
    # Input widgets: the PDF upload and the comma-separated column list.
    column_input = gr.Textbox(label="Enter Column Names (comma-separated)")
    file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    # Output widget: downloadable Excel file.
    output_file = gr.File(label="Download Extracted Excel File")

    # Input order must match the callback signature (file, columns).
    interface = gr.Interface(
        fn=extract_columns_from_pdf,
        inputs=[file_input, column_input],
        # Use the labelled component defined above instead of the anonymous
        # "file" shortcut, which left `output_file` unused.
        outputs=output_file,
    )

    # Launch app
    interface.launch()


if __name__ == "__main__":
    main()
 
1
  import gradio as gr
2
+ from toshiba import extract_toshiba_data
3
+ from bhel import extract_bhel_data
4
+
5
def extract_data(pdf_file, company):
    """Dispatch a PDF to the extractor that matches the selected company.

    Args:
        pdf_file: The uploaded PDF, passed through unchanged to the extractor.
        company: Company name; ``'Toshiba'`` and ``'BHEL'`` are supported.

    Returns:
        Whatever the chosen extractor returns.
        NOTE(review): presumably a path to an Excel file, since the result
        feeds a file output -- confirm against the extractor modules.

    Raises:
        ValueError: If ``company`` is not one of the supported names
            (including ``None`` when nothing was selected).
    """
    if company == 'Toshiba':
        return extract_toshiba_data(pdf_file)
    if company == 'BHEL':
        return extract_bhel_data(pdf_file)
    # Include the offending value so the failure is diagnosable from logs.
    raise ValueError(f"Unsupported company format: {company!r}")
12
+
13
# Companies whose PDF layouts have a dedicated extractor.
company_options = ['Toshiba', 'BHEL']

# Input order must match extract_data(pdf_file, company).
interface = gr.Interface(
    fn=extract_data,
    inputs=[
        # Restrict uploads to PDFs, as the previous version of this app did.
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Dropdown(choices=company_options, label="Select Company"),
    ],
    outputs=gr.File(label="Download Extracted Data as Excel"),
    title="PDF Data Extractor for Toshiba and BHEL",
    description="Upload a PDF file and select the company to extract and format data into an Excel file according to specific requirements.",
)


if __name__ == "__main__":
    interface.launch()