Spaces:

neerajkalyank
/

pdf-to-excel

Build error

App Files Files Community

neerajkalyank committed on Nov 8, 2024

Commit

ec5938f

•

1 Parent(s): 0b7731a

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -68

app.py CHANGED Viewed

@@ -1,71 +1,23 @@
 import gradio as gr
-import pandas as pd
-import pdfplumber
-import io
-import logging
-# Set up logging for better debugging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-def extract_columns_from_pdf(file, columns):
-    try:
-        required_columns = [col.strip() for col in columns.split(",")]
-        pdf_data = []
-        extracted_tables = False  # Flag to check if tables were found
-        with pdfplumber.open(file) as pdf:
-            for page_num, page in enumerate(pdf.pages):
-                tables = page.extract_tables()
-                for table in tables:
-                    if table:  # Ensure table exists
-                        df = pd.DataFrame(table[1:], columns=table[0])
-                        logging.info(f"Processing page {page_num + 1} with columns: {df.columns.tolist()}")
-                        # Check if required columns are present
-                        matching_columns = [col for col in required_columns if col in df.columns]
-                        if len(matching_columns) == len(required_columns):
-                            filtered_df = df[required_columns]
-                            pdf_data.append(filtered_df)
-                            extracted_tables = True
-        if not extracted_tables:
-            logging.error("No tables with specified columns found in the PDF.")
-            return "Error: No tables with specified columns found in the PDF. Please verify the column names."
-        # Combine all filtered tables into one DataFrame
-        full_data = pd.concat(pdf_data, ignore_index=True)
-        # Save to Excel
-        output = io.BytesIO()
-        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-            full_data.to_excel(writer, index=False)
-        output.seek(0)
-        logging.info("Successfully extracted the specified columns and saved to Excel.")
-        return output
-    except KeyError as e:
-        logging.error(f"Column not found: {e}")
-        return f"Error: Column '{e.args[0]}' not found in the extracted tables."
-    except Exception as e:
-        logging.error(f"Unexpected Error: {e}")
-        return f"Unexpected Error: {str(e)}"
-def main():
-    # Define Gradio interface
-    column_input = gr.Textbox(label="Enter Column Names (comma-separated)")
-    file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-    output_file = gr.File(label="Download Extracted Excel File")
-    # Create Gradio interface
-    interface = gr.Interface(
-        fn=extract_columns_from_pdf,
-        inputs=[file_input, column_input],
-        outputs="file",  # Specify "file" output type
-    )
-    # Launch app
-    interface.launch()
 if __name__ == "__main__":
-    main()

 import gradio as gr
+from toshiba import extract_toshiba_data
+from bhel import extract_bhel_data
+def extract_data(pdf_file, company):
+    if company == 'Toshiba':
+        return extract_toshiba_data(pdf_file)
+    elif company == 'BHEL':
+        return extract_bhel_data(pdf_file)
+    else:
+        raise ValueError("Unsupported company format")
+company_options = ['Toshiba', 'BHEL']
+interface = gr.Interface(
+    fn=extract_data,
+    inputs=[gr.File(label="Upload PDF"), gr.Dropdown(choices=company_options, label="Select Company")],
+    outputs=gr.File(label="Download Extracted Data as Excel"),
+    title="PDF Data Extractor for Toshiba and BHEL",
+    description="Upload a PDF file and select the company to extract and format data into an Excel file according to specific requirements."
+)
 if __name__ == "__main__":
+    interface.launch()