Spaces:
Sleeping
Sleeping
import streamlit as st | |
import fitz # PyMuPDF | |
import pandas as pd | |
import re | |
import pdfplumber | |
from html_templates import Extracter_title, tooltip_message_extracter, logo, button_styles_fynder, upload_extracter_file, download_button_styles, extractor_placeholder | |
# Page branding, executed on every script run: render the imported `logo`
# snippet as raw HTML, then set the app logo image.
st.markdown(body=logo, unsafe_allow_html=True)
st.logo(image="alerter_4.jpeg")
def extract_text_from_pdf(uploaded_file):
    """Extract key-value fields and line-item table data from an uploaded PDF.

    Args:
        uploaded_file: Binary file-like object holding the PDF bytes. It is
            rewound before reading when it exposes ``seek``.

    Returns:
        A ``(key_value_data, table_data)`` tuple. ``key_value_data`` is the
        frame produced by ``parse_text_as_table`` from the text of all pages;
        ``table_data`` is whatever ``extract_table_from_pdf`` returns for the
        document. If anything fails, the tuple is a one-row frame with a
        single ``"Error"`` column describing the failure, and ``None``.
    """
    try:
        # Rewind first: a stream that was already consumed would otherwise
        # hand PyMuPDF zero bytes.
        if hasattr(uploaded_file, "seek"):
            uploaded_file.seek(0)
        pdf_bytes = uploaded_file.read()
        # The context manager closes the document even when parsing raises.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text = "\n".join(page.get_text("text") for page in doc)
            key_value_data = parse_text_as_table(text)
            table_data = extract_table_from_pdf(doc)
        return key_value_data, table_data
    except Exception as e:
        # Deliberate catch-all: the failure is reported to the caller as data
        # instead of being raised.
        return pd.DataFrame({"Error": [f"Error extracting PDF text: {str(e)}"]}), None
def parse_text_as_table(text):
    """Turn "key - value" / "key: value" lines into a Field/Value frame.

    Each line of ``text`` is split once, on the first `` - `` if the line has
    one, otherwise on the first ``:``. Lines with neither separator are
    dropped. Both halves are stripped of surrounding whitespace.

    Returns:
        A DataFrame with columns ``"Field"`` and ``"Value"``, one row per
        matching line, in input order.
    """
    rows = []
    for raw_line in text.split("\n"):
        # Order matters: " - " takes precedence over ":".
        for separator in (" - ", ":"):
            field, found, value = raw_line.partition(separator)
            if found:
                rows.append([field.strip(), value.strip()])
                break
    return pd.DataFrame(rows, columns=["Field", "Value"])
def extract_table_from_pdf(doc):
    """Extract line-item rows that follow the table header in a PDF.

    Every page's text blocks are visited top-to-bottom, then left-to-right.
    Blocks are ignored until one contains a header keyword ("Item Name",
    "SAC Code" or "Taxable Value", case-insensitive); header blocks themselves
    are skipped. After that, a block is split on runs of two or more
    whitespace characters and kept as a row when it yields at least 5 cells.

    Args:
        doc: Iterable of pages, each offering ``get_text("blocks")`` that
            returns sequences with y at index 1, x at index 0 and the block
            text at index 4.

    Returns:
        A DataFrame with the six line-item columns, or ``None`` when no row
        was found.
    """
    column_names = ["Item Name", "SAC Code", "Gross Amount", "Tax Type", "Tax Amount", "Total Value"]
    width = len(column_names)
    # Compiled once instead of on every block.
    header_pattern = re.compile(r"Item Name|SAC Code|Taxable Value", re.IGNORECASE)
    cell_gap = re.compile(r"\s{2,}")

    table_data = []
    table_started = False
    for page in doc:
        blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
        for block in blocks:
            text = block[4].strip()
            if header_pattern.search(text):
                table_started = True
                continue
            if not table_started:
                continue
            cells = cell_gap.split(text)
            if len(cells) < 5:
                continue
            # Force every row to exactly `width` cells; otherwise pandas raises
            # ValueError when the widest row does not match the column list.
            if len(cells) > width:
                # Fold surplus cells into the last column so no text is lost.
                cells = cells[:width - 1] + [" ".join(cells[width - 1:])]
            else:
                cells = cells + [None] * (width - len(cells))
            table_data.append(cells)

    if table_data:
        return pd.DataFrame(table_data, columns=column_names)
    return None
def extract_first_table_row(uploaded_file):
    """Return the first data row of the first usable table as Field/Value pairs.

    Pages are scanned in order. For each page, the result of
    ``page.extract_table()`` is used with its first row as headers; the first
    page whose table has at least one row below the header wins.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts for the PDF.

    Returns:
        A DataFrame with columns ``"Field"`` (header cell) and ``"Value"``
        (matching cell of the first data row), or ``None`` when no page
        yields a table with a data row.
    """
    with pdfplumber.open(uploaded_file) as pdf:
        for page in pdf.pages:
            extracted_table = page.extract_table()
            if not extracted_table:
                continue
            df = pd.DataFrame(extracted_table)
            headers = df.iloc[0].tolist()
            data_rows = df.iloc[1:]
            if data_rows.empty:
                # Header-only table: try the next page.
                continue
            first_row = data_rows.iloc[0].tolist()
            # Pair by position, not by label: duplicate or None header cells
            # would make a label lookup return a Series (or the wrong cell)
            # instead of a single value.
            reshaped_data = [[header, value] for header, value in zip(headers, first_row)]
            return pd.DataFrame(reshaped_data, columns=["Field", "Value"])
    return None
def main():
    """Render the extractor page: upload a PDF, show and offer the results.

    After a file is uploaded, the key-value fields and the first table row are
    combined into one frame that is displayed and offered as a CSV download.
    When line-item table data was extracted as well, it is displayed and
    offered as a second CSV download.
    """
    st.markdown(Extracter_title, unsafe_allow_html=True)
    st.write("")
    st.markdown(tooltip_message_extracter, unsafe_allow_html=True)
    st.markdown(upload_extracter_file, unsafe_allow_html=True)

    # A real label (hidden visually) instead of "": Streamlit discourages
    # empty widget labels for accessibility reasons.
    uploaded_file = st.file_uploader(
        "Upload a PDF file",
        type=["pdf"],
        accept_multiple_files=False,
        label_visibility="collapsed",
    )
    if not uploaded_file:
        return

    extracted_df, table_df = extract_text_from_pdf(uploaded_file)

    # extract_text_from_pdf read the stream to its end; rewind so the second
    # parser starts from the first byte.
    uploaded_file.seek(0)
    first_row_df = extract_first_table_row(uploaded_file)

    # Combine key-value data with the first table row, when there is one.
    if first_row_df is not None:
        combined_df = pd.concat([extracted_df, first_row_df], ignore_index=True)
    else:
        combined_df = extracted_df

    # Display the combined data and offer it as CSV.
    st.markdown(extractor_placeholder, unsafe_allow_html=True)
    st.dataframe(combined_df)
    csv_combined = combined_df.to_csv(index=False).encode('utf-8')
    st.markdown(download_button_styles, unsafe_allow_html=True)
    st.download_button("Download", csv_combined, file_name="combined_extracted_data.csv", mime="text/csv")

    # Display the line-item table separately and offer it as CSV.
    if table_df is not None:
        st.subheader("Extracted Table Data")
        st.dataframe(table_df)
        csv_table = table_df.to_csv(index=False).encode('utf-8')
        st.markdown(download_button_styles, unsafe_allow_html=True)
        # The label was missing here, so the CSV bytes were passed as the
        # label and the required `data` argument was absent. The explicit key
        # keeps this widget distinct from the other "Download" button.
        st.download_button(
            "Download",
            csv_table,
            file_name="extracted_data.csv",
            mime="text/csv",
            key="download_table_csv",
        )


if __name__ == "__main__":
    main()