# Source: Alerter_v4.0 / pages / Extractor.py
# Uploaded by: Ninad077
# Commit message: Upload 10 files
# Commit: d8535a4 (verified)
import streamlit as st
import fitz # PyMuPDF
import pandas as pd
import re
import pdfplumber
from html_templates import Extracter_title, tooltip_message_extracter, logo, button_styles_fynder, upload_extracter_file, download_button_styles, extractor_placeholder
# Render the `logo` snippet imported from html_templates. unsafe_allow_html=True
# makes Streamlit emit the markup as-is instead of escaping it.
st.markdown(logo, unsafe_allow_html=True)
# Register "alerter_4.jpeg" as the app logo.
# NOTE(review): the path is relative — presumably resolved against the
# directory the app is launched from; confirm.
st.logo("alerter_4.jpeg")
def extract_text_from_pdf(uploaded_file):
    """Extract key-value pairs and line-item table data from an uploaded PDF.

    Args:
        uploaded_file: Binary file-like object whose ``read()`` returns the
            PDF bytes. The stream is consumed by this call.

    Returns:
        A ``(key_value_data, table_data)`` tuple. ``key_value_data`` is the
        DataFrame produced by ``parse_text_as_table`` from the text of all
        pages; ``table_data`` is whatever ``extract_table_from_pdf`` returns
        (a DataFrame or ``None``). If any step raises, the tuple is instead
        a one-row DataFrame with a single ``"Error"`` column describing the
        failure, and ``None``.
    """
    try:
        # Open the document in a context manager so it is closed even when
        # one of the parsing steps raises; previously it was never released.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            text = "\n".join(page.get_text("text") for page in doc)
            key_value_data = parse_text_as_table(text)
            table_data = extract_table_from_pdf(doc)
        return key_value_data, table_data
    except Exception as e:
        # Deliberate catch-all: the caller displays the returned frame
        # rather than handling exceptions itself.
        return pd.DataFrame({"Error": [f"Error extracting PDF text: {str(e)}"]}), None
def parse_text_as_table(text):
    """Turn ``key - value`` / ``key: value`` lines into a two-column table.

    Each line of ``text`` is split at the first `` - `` if it contains one,
    otherwise at the first ``:``. Lines containing neither separator are
    skipped. Both halves are stripped of surrounding whitespace.

    Returns:
        A DataFrame with columns ``"Field"`` and ``"Value"``, one row per
        line that was split.
    """
    # Order matters: " - " takes precedence over ":" when both are present.
    separators = (" - ", ":")
    pairs = []
    for raw_line in text.split("\n"):
        for sep in separators:
            field, found, remainder = raw_line.partition(sep)
            if found:
                pairs.append([field.strip(), remainder.strip()])
                break
    return pd.DataFrame(pairs, columns=["Field", "Value"])
def extract_table_from_pdf(doc):
    """Extract line-item rows that follow the table header, using PyMuPDF.

    Text blocks of every page are visited top-to-bottom, then left-to-right.
    Nothing is collected until a block matching one of the header labels
    (``Item Name``, ``SAC Code``, ``Taxable Value``; case-insensitive) has
    been seen; that header block itself is skipped. Once the header has been
    seen the state carries over to the following pages. Each later block is
    split on runs of two or more whitespace characters and kept if it yields
    at least five cells.

    Args:
        doc: Iterable of page objects exposing ``get_text("blocks")``, where
            each block is a sequence with x at index 0, y at index 1 and the
            block text at index 4.

    Returns:
        A DataFrame with the six fixed invoice columns, or ``None`` when no
        row was collected.
    """
    column_names = ["Item Name", "SAC Code", "Gross Amount", "Tax Type", "Tax Amount", "Total Value"]
    width = len(column_names)
    # Compiled once instead of on every block.
    header_pattern = re.compile(r"Item Name|SAC Code|Taxable Value", re.IGNORECASE)
    cell_gap = re.compile(r"\s{2,}")

    table_data = []
    table_started = False
    for page in doc:
        # Sort by Y, then X so blocks are visited in reading order.
        blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
        for block in blocks:
            text = block[4].strip()
            if header_pattern.search(text):
                table_started = True  # Start extracting after headers
                continue  # Skip header line
            if not table_started:
                continue
            cells = cell_gap.split(text)
            if len(cells) < 5:  # Too few cells to be a table row
                continue
            # Normalise to exactly `width` cells. Rows are accepted from five
            # cells upward, but the frame has six fixed columns: without this
            # a five-cell-only table or any row with more than six cells made
            # the DataFrame constructor raise ValueError.
            if len(cells) > width:
                # Fold the overflow into the last cell so no text is lost.
                cells = cells[:width - 1] + [" ".join(cells[width - 1:])]
            else:
                cells = cells + [None] * (width - len(cells))
            table_data.append(cells)
    if table_data:
        return pd.DataFrame(table_data, columns=column_names)
    return None
def extract_first_table_row(uploaded_file):
    """Return the first data row of the first usable table as Field/Value pairs.

    Pages are scanned in order with pdfplumber. On each page the table
    returned by ``page.extract_table()`` is inspected: its first row is taken
    as the header and its second row as the data. Pages with no table, or
    with a table that has no row below the header, are skipped.

    Args:
        uploaded_file: Path or binary file-like object accepted by
            ``pdfplumber.open``.

    Returns:
        A DataFrame with columns ``"Field"`` (header cell) and ``"Value"``
        (matching cell of the first data row), or ``None`` if no page
        provides such a row.
    """
    with pdfplumber.open(uploaded_file) as pdf:
        for page in pdf.pages:
            extracted_table = page.extract_table()
            if not extracted_table:
                continue
            # Going through a DataFrame pads ragged rows to a common width.
            df = pd.DataFrame(extracted_table)
            if len(df) < 2 or df.shape[1] == 0:
                continue  # Header only (or no cells): try the next page
            headers = df.iloc[0].tolist()
            values = df.iloc[1].tolist()
            # Pair header and value by position. Looking values up by header
            # label returns a Series instead of a scalar for duplicate
            # headers and is unreliable for empty (None) header cells.
            reshaped_data = [[field, value] for field, value in zip(headers, values)]
            return pd.DataFrame(reshaped_data, columns=["Field", "Value"])
    return None
def main():
    """Render the extractor page.

    Shows the title, tooltip and upload prompt, then waits for a single PDF
    upload. Once a file is present it:

    1. runs ``extract_text_from_pdf`` for key-value data and the line-item
       table,
    2. runs ``extract_first_table_row`` and appends its rows (if any) to the
       key-value data,
    3. displays the combined data with a CSV download button, and
    4. if a line-item table was found, displays it separately with its own
       CSV download button.
    """
    st.markdown(Extracter_title, unsafe_allow_html=True)
    st.write("")
    st.markdown(tooltip_message_extracter, unsafe_allow_html=True)
    st.markdown(upload_extracter_file, unsafe_allow_html=True)
    uploaded_file = st.file_uploader("", type=["pdf"], accept_multiple_files=False)
    if not uploaded_file:
        return

    extracted_df, table_df = extract_text_from_pdf(uploaded_file)

    # extract_text_from_pdf consumed the stream with read(); rewind it so the
    # second parser starts from the beginning of the file.
    uploaded_file.seek(0)
    # Extract first row of table data
    first_row_df = extract_first_table_row(uploaded_file)

    # Combine key-value data and first row of table data
    if first_row_df is not None:
        combined_df = pd.concat([extracted_df, first_row_df], ignore_index=True)
    else:
        combined_df = extracted_df  # If no table row exists, use only key-value data

    # Display combined extracted data
    st.markdown(extractor_placeholder, unsafe_allow_html=True)
    st.dataframe(combined_df)

    # Allow user to download combined extracted data
    csv_combined = combined_df.to_csv(index=False).encode('utf-8')
    st.markdown(download_button_styles, unsafe_allow_html=True)
    st.download_button("Download", csv_combined, file_name="combined_extracted_data.csv", mime="text/csv")

    # Display extracted table data separately
    if table_df is not None:
        st.subheader("Extracted Table Data")
        st.dataframe(table_df)

        # Allow user to download extracted table data
        csv_table = table_df.to_csv(index=False).encode('utf-8')
        st.markdown(download_button_styles, unsafe_allow_html=True)
        # The label must come first and the payload second; previously the
        # CSV bytes were passed as the label and `data` was missing, which
        # raised TypeError whenever a table was found.
        st.download_button("Download", csv_table, file_name="extracted_data.csv", mime="text/csv")


if __name__ == "__main__":
    main()