Spaces:

Ninad077
/

Alerter_v4.0

Sleeping

App Files Files Community

Alerter_v4.0 / pages /Extractor.py

Ninad077

Upload 10 files

d8535a4 verified 3 months ago

raw

history blame contribute delete

4.82 kB

	import streamlit as st
	import fitz # PyMuPDF
	import pandas as pd
	import re
	import pdfplumber
	from html_templates import Extracter_title, tooltip_message_extracter, logo, button_styles_fynder, upload_extracter_file, download_button_styles, extractor_placeholder



	# Module-level page chrome: these two calls run every time the script is
	# executed, before main() is invoked.
	# `logo` is an HTML snippet imported from html_templates, so raw HTML
	# rendering has to be enabled for it.
	st.markdown(logo, unsafe_allow_html=True)
	# Register the app logo image. The path is relative -- presumably resolved
	# against the app's working directory (TODO confirm).
	st.logo("alerter_4.jpeg")

	def extract_text_from_pdf(uploaded_file):
	    """Extract key-value pairs and invoice-table rows from an uploaded PDF.

	    Args:
	        uploaded_file: Binary file-like object holding the PDF (in this
	            app, the value returned by ``st.file_uploader``).

	    Returns:
	        A ``(key_value_data, table_data)`` tuple. ``key_value_data`` is the
	        "Field"/"Value" DataFrame produced by ``parse_text_as_table`` from
	        the text of all pages; ``table_data`` is whatever
	        ``extract_table_from_pdf`` returns (a DataFrame, or ``None`` when
	        no table rows were found). On any failure the function does not
	        raise: it returns a one-row DataFrame with an "Error" column and
	        ``None``.
	    """
	    try:
	        # The same upload object is handed to other readers by main(), so
	        # rewind it before reading (a stream already at EOF would yield no
	        # bytes) and again afterwards (so the next reader starts at byte 0).
	        seekable = getattr(uploaded_file, "seekable", None)
	        can_rewind = callable(seekable) and seekable()
	        if can_rewind:
	            uploaded_file.seek(0)
	        pdf_bytes = uploaded_file.read()
	        if can_rewind:
	            uploaded_file.seek(0)

	        # The context manager closes the PyMuPDF document even if parsing
	        # fails; both extraction passes must run while it is still open.
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	            text = "\n".join(page.get_text("text") for page in doc)
	            key_value_data = parse_text_as_table(text)
	            table_data = extract_table_from_pdf(doc)

	        return key_value_data, table_data
	    except Exception as e:
	        # Deliberate best-effort boundary: the caller displays the returned
	        # frame, so report the problem as data instead of crashing the page.
	        return pd.DataFrame({"Error": [f"Error extracting PDF text: {str(e)}"]}), None

	def parse_text_as_table(text):
	    """Turn "key - value" / "key: value" lines into a Field/Value DataFrame.

	    Each line of ``text`` is checked for the separator " - " first and ":"
	    second. The line is split at the first occurrence of the first
	    separator that is present, both halves are stripped, and the pair
	    becomes one row. Lines containing neither separator are skipped.
	    """
	    separators = (" - ", ":")
	    pairs = []
	    for raw_line in text.split("\n"):
	        for separator in separators:
	            field, found, content = raw_line.partition(separator)
	            if found:
	                pairs.append([field.strip(), content.strip()])
	                break  # " - " takes precedence over ":" on the same line
	    return pd.DataFrame(pairs, columns=["Field", "Value"])

	def extract_table_from_pdf(doc):
	    """Extract invoice-table rows from the text blocks of a PyMuPDF document.

	    Blocks are visited page by page, sorted by their second then first
	    tuple element (top-to-bottom, then left-to-right). Nothing is collected
	    until a block mentioning one of the header names "Item Name",
	    "SAC Code" or "Taxable Value" (case-insensitive) has been seen; header
	    blocks themselves are skipped. After that, each block's text is split
	    on runs of two or more whitespace characters and kept as a row when it
	    yields at least five cells.

	    Args:
	        doc: Iterable of pages, each offering ``get_text("blocks")`` that
	            returns tuples whose element 4 is the block text.

	    Returns:
	        A DataFrame with the six invoice columns, or ``None`` when no row
	        was collected.
	    """
	    column_names = ["Item Name", "SAC Code", "Gross Amount", "Tax Type", "Tax Amount", "Total Value"]
	    width = len(column_names)
	    # Unescaped pipes: match ANY of the header names. The previous pattern
	    # escaped them and therefore only matched the literal text
	    # "Item Name|SAC Code|Taxable Value".
	    header_pattern = re.compile(r"Item Name|SAC Code|Taxable Value", re.IGNORECASE)
	    cell_gap = re.compile(r"\s{2,}")  # cells are separated by wide gaps

	    table_data = []
	    table_started = False

	    for page in doc:
	        blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))

	        for block in blocks:
	            text = block[4].strip()
	            if header_pattern.search(text):
	                table_started = True  # rows begin after the header
	                continue

	            if not table_started:
	                continue

	            cells = cell_gap.split(text)
	            if len(cells) < 5:  # too few cells to be a table row
	                continue

	            # Normalise to exactly `width` cells (pad short rows with None,
	            # drop overflow cells) so building the DataFrame cannot fail
	            # with a column-count mismatch.
	            cells = cells[:width] + [None] * (width - len(cells))
	            table_data.append(cells)

	    if not table_data:
	        return None
	    return pd.DataFrame(table_data, columns=column_names)


	def extract_first_table_row(uploaded_file):
	    """Return the first data row of the first usable table as Field/Value pairs.

	    Pages are scanned in order with pdfplumber. On the first page whose
	    extracted table has a header row plus at least one data row, the header
	    cells become the "Field" column and the first data row's cells become
	    the "Value" column. Pages with no table, or with a header-only table,
	    are skipped.

	    Args:
	        uploaded_file: Path or binary file-like object accepted by
	            ``pdfplumber.open``.

	    Returns:
	        A two-column ("Field", "Value") DataFrame, or ``None`` when no page
	        yields a table with a data row.
	    """
	    # main() passes the same upload object that extract_text_from_pdf has
	    # already read, so start from byte 0 when the stream allows it.
	    seekable = getattr(uploaded_file, "seekable", None)
	    if callable(seekable) and seekable():
	        uploaded_file.seek(0)

	    with pdfplumber.open(uploaded_file) as pdf:
	        for page in pdf.pages:
	            extracted_table = page.extract_table()
	            if not extracted_table:
	                continue

	            table = pd.DataFrame(extracted_table)
	            if len(table) < 2:  # header only, no data row on this page
	                continue

	            # Pair headers and values by POSITION. Looking values up by
	            # header label breaks when headers repeat or are None (empty
	            # header cells), because the lookup then returns a whole Series
	            # instead of a single cell.
	            headers = table.iloc[0].tolist()
	            values = table.iloc[1].tolist()
	            return pd.DataFrame(list(zip(headers, values)), columns=["Field", "Value"])

	    return None

	def main():
	    """Render the Extractor page.

	    Shows the title, tooltip and upload prompt, then -- once a PDF has been
	    uploaded -- displays the key-value data combined with the first table
	    row and offers it as a CSV download. When ``extract_text_from_pdf``
	    also returns table data, that table is shown and offered as a second
	    CSV download.
	    """
	    st.markdown(Extracter_title, unsafe_allow_html=True)
	    st.write("")
	    st.markdown(tooltip_message_extracter, unsafe_allow_html=True)

	    st.markdown(upload_extracter_file, unsafe_allow_html=True)
	    # Give the widget a real label (an empty one is discouraged by
	    # Streamlit) but keep it hidden: the visible prompt is the HTML
	    # rendered just above.
	    uploaded_file = st.file_uploader(
	        "Upload PDF",
	        type=["pdf"],
	        accept_multiple_files=False,
	        label_visibility="collapsed",
	    )

	    if not uploaded_file:
	        return

	    extracted_df, table_df = extract_text_from_pdf(uploaded_file)

	    # Extract first row of table data
	    first_row_df = extract_first_table_row(uploaded_file)

	    # Combine key-value data and first row of table data; if no table row
	    # exists, use only the key-value data.
	    if first_row_df is not None:
	        combined_df = pd.concat([extracted_df, first_row_df], ignore_index=True)
	    else:
	        combined_df = extracted_df

	    # Display combined extracted data
	    st.markdown(extractor_placeholder, unsafe_allow_html=True)
	    st.dataframe(combined_df)

	    # Allow user to download combined extracted data
	    csv_combined = combined_df.to_csv(index=False).encode('utf-8')
	    st.markdown(download_button_styles, unsafe_allow_html=True)
	    st.download_button("Download", csv_combined, file_name="combined_extracted_data.csv", mime="text/csv")

	    # Display extracted table data separately
	    if table_df is not None:
	        st.subheader("Extracted Table Data")
	        st.dataframe(table_df)

	        # Allow user to download extracted table data
	        csv_table = table_df.to_csv(index=False).encode('utf-8')

	        st.markdown(download_button_styles, unsafe_allow_html=True)
	        # The label argument was missing here, which made the CSV bytes the
	        # label and left the required `data` argument unset. An explicit
	        # key keeps this button distinct from the "Download" button above.
	        st.download_button(
	            "Download",
	            csv_table,
	            file_name="extracted_data.csv",
	            mime="text/csv",
	            key="download_table_csv",
	        )

	# Script entry point: build the page only when this file is executed as
	# the main module, not when it is imported by another module.
	if __name__ == "__main__":
	    main()