"""Streamlit page that extracts key-value fields and table rows from an uploaded PDF."""

import re

import fitz  # PyMuPDF
import pandas as pd
import pdfplumber
import streamlit as st

from html_templates import (
    Extracter_title,
    tooltip_message_extracter,
    logo,
    button_styles_fynder,
    upload_extracter_file,
    download_button_styles,
    extractor_placeholder,
)

# Column names applied to every row collected by extract_table_from_pdf.
_TABLE_COLUMNS = [
    "Item Name",
    "SAC Code",
    "Gross Amount",
    "Tax Type",
    "Tax Amount",
    "Total Value",
]
# A text block containing any of these phrases is treated as the table header.
_TABLE_HEADER_PATTERN = re.compile(r"Item Name|SAC Code|Taxable Value", re.IGNORECASE)
# Two or more whitespace characters are treated as a column separator.
_COLUMN_GAP_PATTERN = re.compile(r"\s{2,}")

st.markdown(logo, unsafe_allow_html=True)
st.logo("alerter_4.jpeg")


def extract_text_from_pdf(uploaded_file):
    """Extract key-value fields and table rows from an uploaded PDF.

    Args:
        uploaded_file: Binary file-like object; its content is consumed with
            ``.read()``, so the caller must rewind it before reusing it.

    Returns:
        A ``(key_value_df, table_df)`` tuple. ``table_df`` is ``None`` when no
        table rows were found. If anything fails, the first element is a
        one-row DataFrame with a single ``"Error"`` column and the second
        element is ``None``.
    """
    try:
        # The context manager closes the document even if parsing raises.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            text = "\n".join(page.get_text("text") for page in doc)
            key_value_data = parse_text_as_table(text)
            table_data = extract_table_from_pdf(doc)
        return key_value_data, table_data
    except Exception as e:
        # Deliberate best-effort boundary: the failure is reported as data.
        return pd.DataFrame({"Error": [f"Error extracting PDF text: {str(e)}"]}), None


def parse_text_as_table(text):
    """Extract key-value pairs from lines shaped like ``key - value`` or ``key: value``.

    A line is split at the first ``" - "`` if present, otherwise at the first
    ``":"``. Lines containing neither separator are skipped.

    Returns:
        DataFrame with columns ``"Field"`` and ``"Value"`` (possibly empty).
    """
    data = []
    for line in text.split("\n"):
        # " - " takes precedence over ":" when both occur on the same line.
        for separator in (" - ", ":"):
            key, found, value = line.partition(separator)
            if found:
                data.append([key.strip(), value.strip()])
                break
    return pd.DataFrame(data, columns=["Field", "Value"])


def _fit_row(cells, width):
    """Return *cells* padded with ``None`` or folded so it has exactly *width* items."""
    if len(cells) > width:
        # Fold the overflow into the last column so no extracted text is dropped.
        return cells[: width - 1] + [" ".join(cells[width - 1:])]
    return cells + [None] * (width - len(cells))


def extract_table_from_pdf(doc):
    """Extract tabular rows from the text blocks of an open PyMuPDF document.

    Blocks are visited top-to-bottom, then left-to-right. Collection starts
    after the first block matching a header phrase and continues across pages.
    A block counts as a row when splitting it on runs of two or more
    whitespace characters yields at least five cells.

    Returns:
        DataFrame with the six fixed table columns, or ``None`` when no row
        was collected.
    """
    table_data = []
    table_started = False
    width = len(_TABLE_COLUMNS)
    for page in doc:
        # Sort by Y, then X so blocks are visited in reading order.
        blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
        for block in blocks:
            text = block[4].strip()
            if _TABLE_HEADER_PATTERN.search(text):
                table_started = True
                continue  # The header block itself is not a data row.
            if not table_started:
                continue
            columns = _COLUMN_GAP_PATTERN.split(text)
            if len(columns) >= 5:
                # Rows may have 5 or 7+ cells; normalise them so building the
                # DataFrame with six column names cannot raise ValueError.
                table_data.append(_fit_row(columns, width))
    if table_data:
        return pd.DataFrame(table_data, columns=_TABLE_COLUMNS)
    return None


def extract_first_table_row(uploaded_file):
    """Return the first data row of the first usable pdfplumber table, reshaped.

    The first table row is used as headers and the second as values; they are
    paired by position, so duplicate or empty header cells are handled.

    Args:
        uploaded_file: Anything ``pdfplumber.open`` accepts.

    Returns:
        DataFrame with columns ``"Field"`` (header) and ``"Value"`` (first-row
        cell), or ``None`` when no page has a table with at least one data row.
    """
    with pdfplumber.open(uploaded_file) as pdf:
        for page in pdf.pages:
            extracted_table = page.extract_table()
            if not extracted_table:
                continue
            df = pd.DataFrame(extracted_table)
            if len(df) < 2:
                continue  # Header row only; try the next page.
            headers = df.iloc[0].tolist()
            values = df.iloc[1].tolist()
            return pd.DataFrame({"Field": headers, "Value": values})
    return None


def main():
    """Render the extractor page: upload a PDF, show and offer the extracted data."""
    st.markdown(Extracter_title, unsafe_allow_html=True)
    st.write("")
    st.markdown(tooltip_message_extracter, unsafe_allow_html=True)
    st.markdown(upload_extracter_file, unsafe_allow_html=True)

    # A real label hidden from view replaces the empty label.
    uploaded_file = st.file_uploader(
        "Upload a PDF file",
        type=["pdf"],
        accept_multiple_files=False,
        label_visibility="collapsed",
    )
    if not uploaded_file:
        return

    extracted_df, table_df = extract_text_from_pdf(uploaded_file)

    # extract_text_from_pdf reports failures as a single-column "Error" frame;
    # show the message instead of mixing it into the downloadable data.
    if list(extracted_df.columns) == ["Error"]:
        st.error(extracted_df["Error"].iloc[0])
        return

    # extract_text_from_pdf consumed the stream with .read(); rewind it before
    # handing the same object to pdfplumber.
    uploaded_file.seek(0)
    first_row_df = extract_first_table_row(uploaded_file)

    # Combine key-value data and the first table row when one exists.
    if first_row_df is not None:
        combined_df = pd.concat([extracted_df, first_row_df], ignore_index=True)
    else:
        combined_df = extracted_df

    # Display the combined data and offer it as CSV.
    st.markdown(extractor_placeholder, unsafe_allow_html=True)
    st.dataframe(combined_df)
    csv_combined = combined_df.to_csv(index=False).encode('utf-8')
    st.markdown(download_button_styles, unsafe_allow_html=True)
    st.download_button(
        "Download",
        csv_combined,
        file_name="combined_extracted_data.csv",
        mime="text/csv",
    )

    # Display the extracted table separately and offer it as CSV.
    if table_df is not None:
        st.subheader("Extracted Table Data")
        st.dataframe(table_df)
        csv_table = table_df.to_csv(index=False).encode('utf-8')
        st.markdown(download_button_styles, unsafe_allow_html=True)
        # The original call passed the CSV bytes as the label and omitted the
        # data argument; the explicit key keeps this button distinct from the
        # first "Download" button.
        st.download_button(
            "Download",
            csv_table,
            file_name="extracted_data.csv",
            mime="text/csv",
            key="download_table_csv",
        )


if __name__ == "__main__":
    main()