Spaces:
Sleeping
Sleeping
| import pdfplumber | |
| import pandas as pd | |
| import re | |
| import gradio as gr | |
| def extract_data(pdf_file): | |
| """ | |
| Extract data from the uploaded PDF for dynamic ranges (e.g., 10 to n). | |
| """ | |
| data = [] | |
| columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"] | |
| start_si = 10 # Start from SI No 10 | |
| end_si = None # Dynamically detect the end SI No | |
| with pdfplumber.open(pdf_file) as pdf: | |
| for page in pdf.pages: | |
| full_text = page.extract_text() # Get the text content for the page | |
| lines = full_text.splitlines() if full_text else [] | |
| for line in lines: | |
| try: | |
| # Parse the first column for SI No | |
| si_no_match = re.match(r"^\s*(\d+)\s", line) | |
| if not si_no_match: | |
| continue | |
| si_no = int(si_no_match.group(1)) | |
| # Dynamically set the end SI No if higher SI Nos are found | |
| if end_si is None or si_no > end_si: | |
| end_si = si_no | |
| if si_no < start_si: | |
| continue # Skip rows below the start SI No | |
| # Extract Material Description and details dynamically | |
| material_desc = extract_material_description(full_text, si_no) | |
| # Extract remaining fields | |
| parts = line.split() | |
| unit = parts[3] | |
| quantity = int(parts[4]) | |
| dely_qty = int(parts[5]) | |
| dely_date = parts[6] | |
| unit_rate = float(parts[7]) | |
| value = float(parts[8]) | |
| # Append row data | |
| data.append([si_no, material_desc, unit, quantity, dely_qty, dely_date, unit_rate, value]) | |
| except (ValueError, IndexError): | |
| # Skip invalid rows or rows with missing data | |
| continue | |
| # Convert data to DataFrame and save as Excel | |
| df = pd.DataFrame(data, columns=columns) | |
| excel_path = "/tmp/Extracted_PO_Data_Dynamic.xlsx" | |
| df.to_excel(excel_path, index=False) | |
| return excel_path | |
| def extract_material_description(full_text, si_no): | |
| """ | |
| Extract Material Description, including Material Number, HSN Code, and IGST, using unique patterns. | |
| """ | |
| material_desc = "" | |
| # Match the specific SI No row to extract details | |
| si_no_pattern = rf"{si_no}\s+(BPS\s+\d+).*?Material\s+Number:\s+(\d+)" | |
| match = re.search(si_no_pattern, full_text, re.DOTALL) | |
| if match: | |
| bps_code = match.group(1) | |
| material_number = match.group(2) | |
| material_desc += f"{bps_code}\nMaterial Number: {material_number}\n" | |
| # Extract HSN Code | |
| hsn_code_match = re.search(r"HSN\s+Code:\s*(\d+)", full_text) | |
| if hsn_code_match: | |
| hsn_code = hsn_code_match.group(1) | |
| material_desc += f"HSN Code: {hsn_code}\n" | |
| else: | |
| material_desc += "HSN Code: Not Found\n" | |
| # Extract IGST | |
| igst_match = re.search(r"IGST\s*:\s*(\d+)\s*%", full_text) | |
| if igst_match: | |
| igst = igst_match.group(1) | |
| material_desc += f"IGST: {igst} %" | |
| else: | |
| material_desc += "IGST: Not Found" | |
| return material_desc.strip() | |
| def process_pdf(file): | |
| """ | |
| Process the uploaded PDF and return the extracted data. | |
| """ | |
| try: | |
| # Extract text from the PDF | |
| output_path = extract_data(file.name) | |
| return output_path, "Data extraction successful!" | |
| except Exception as e: | |
| return None, f"Error during processing: {str(e)}" | |
| # Gradio Interface | |
| def gradio_interface(pdf_file): | |
| """ | |
| Interface function for Gradio to process the PDF and return the Excel file. | |
| """ | |
| return extract_data(pdf_file.name) | |
| # Define Gradio interface | |
| interface = gr.Interface( | |
| fn=gradio_interface, | |
| inputs=gr.File(label="Upload PDF"), | |
| outputs=gr.File(label="Download Extracted Excel"), | |
| title="Dynamic BHEL PO Data Extractor", | |
| description="Upload a PDF to extract accurate Material Numbers and related data dynamically into an Excel file." | |
| ) | |
| if __name__ == "__main__": | |
| interface.launch() | |