Spaces:

el-camino-de-santiago
/

model-pick

Sleeping

App Files Files Community

model-pick / app.py

anmolsahai

bug12

28cc4a3 5 months ago

raw

history blame

4.58 kB

	import os
	import re
	import tempfile
	from difflib import unified_diff

	import fitz # PyMuPDF
	import streamlit as st
	from docx import Document
	from docx.shared import RGBColor

	from langchain_pipeline import pipeline, model_names

	def pdf_to_text_with_layout(pdf_file):
	    """Extract the plain text of every page of a PDF.

	    Args:
	        pdf_file: Binary file-like object holding the PDF. It is consumed
	            with a single ``read()`` call, so its position is left at the
	            end of the stream afterwards.

	    Returns:
	        The ``"text"`` extraction of each page, in page order, joined with
	        newline characters.
	    """
	    # Open the document as a context manager so it is closed even when a
	    # page fails to load; the original never closed it.
	    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
	        # str.join drains the generator before the ``with`` block exits, so
	        # every page is read while the document is still open.
	        return "\n".join(
	            doc.load_page(page_num).get_text("text")
	            for page_num in range(doc.page_count)
	        )

	def clean_text(text):
	    """Return *text* with unprintable and non-BMP characters removed.

	    A character is kept only when ``str.isprintable()`` is true for it and
	    its code point is below 65536. Printable non-ASCII characters such as
	    accented letters are therefore preserved, while control characters
	    (including tab and newline) and characters outside the Basic
	    Multilingual Plane are dropped.
	    """
	    kept = []
	    for ch in text:
	        # Skip anything outside the BMP or not printable.
	        if ord(ch) >= 65536 or not ch.isprintable():
	            continue
	        kept.append(ch)
	    return ''.join(kept)

	def text_to_word_with_formatting(text, word_path):
	    """Write *text* to a Word document, one paragraph per input line.

	    The text is split on ``"\n"``; each piece is passed through
	    ``clean_text`` and appended as its own paragraph (empty lines become
	    empty paragraphs). No styling is applied beyond the document defaults.

	    Args:
	        text: The string to write.
	        word_path: Destination handed to ``Document.save``.
	    """
	    document = Document()
	    for raw_line in text.split("\n"):
	        document.add_paragraph(clean_text(raw_line))
	    document.save(word_path)

	def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
	    """Forward the uploaded file and the form answers to ``pipeline``.

	    All seven arguments are passed positionally, in the same order as this
	    function's own parameters, and the pipeline's return value is handed
	    back unchanged.
	    """
	    pipeline_args = (
	        file,
	        model_name,
	        balance_type,
	        apsn_transactions,
	        max_fees_per_day,
	        min_overdrawn_fee,
	        min_transaction_overdraft,
	    )
	    return pipeline(*pipeline_args)

	def redline_changes(original_path, revised_path, output_path):
	    """Write a colour-coded unified diff of two Word documents to a third.

	    The paragraph texts of each input document are joined with newlines and
	    compared line by line with ``difflib.unified_diff``. Every diff line
	    becomes one 'Normal' paragraph in the output document:

	    * lines starting with ``-`` are red,
	    * lines starting with ``+`` are green,
	    * lines starting with ``@@`` (hunk headers) are blue,
	    * everything else is left in the default colour.

	    The ``---`` / ``+++`` header lines emitted by ``unified_diff`` match the
	    ``-`` / ``+`` prefixes and are therefore coloured red / green as well.

	    Args:
	        original_path: Path of the original .docx file.
	        revised_path: Path of the revised .docx file.
	        output_path: Where the diff document is saved.
	    """
	    def _document_text(path):
	        # One line of text per paragraph of the document at *path*.
	        return "\n".join(para.text for para in Document(path).paragraphs)

	    before = _document_text(original_path)
	    after = _document_text(revised_path)
	    delta = unified_diff(before.splitlines(), after.splitlines(), lineterm='')

	    # The prefixes are mutually exclusive, so lookup order does not matter.
	    prefix_colours = (
	        ('-', RGBColor(255, 0, 0)),   # Red
	        ('+', RGBColor(0, 128, 0)),   # Green
	        ('@@', RGBColor(0, 0, 255)),  # Blue
	    )

	    report = Document()
	    for entry in delta:
	        colour = next(
	            (rgb for prefix, rgb in prefix_colours if entry.startswith(prefix)),
	            None,
	        )
	        if colour is None:
	            report.add_paragraph(entry, style='Normal')
	            continue
	        paragraph = report.add_paragraph(style='Normal')
	        paragraph.add_run(entry).font.color.rgb = colour

	    report.save(output_path)

	# Streamlit App
	#
	# Collects the fee-policy answers and a PDF upload, runs the pipeline on the
	# upload, and offers a Word document showing a colour-coded diff between the
	# text extracted from the PDF and the pipeline's output.
	st.title("Canarie AI Prototype")
	st.subheader("Finding the canarie in the coal mine")

	model_name = st.selectbox("Model", model_names())

	balance_type = st.selectbox("Do you charge on available balance or ledger balance?", ["available balance", "ledger balance"])

	apsn_transactions = st.selectbox("Do you charge for APSN transactions?", ["yes", "no"])

	max_fees_per_day = st.number_input("How many overdraft fees per day can be charged?", min_value=0, max_value=10)

	min_overdrawn_fee = st.number_input("What is the minimum amount overdrawn to incur a fee?", min_value=0, max_value=500)

	min_transaction_overdraft = st.number_input("What is the minimum transaction amount to trigger an overdraft?", min_value=0, max_value=500)

	uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

	if uploaded_file is not None:
	    with st.spinner('Please wait ...'):
	        try:
	            # Extract the text of the uploaded PDF.
	            extracted_text = pdf_to_text_with_layout(uploaded_file)

	            # The extraction step consumed the upload stream with read().
	            # Rewind it so the pipeline sees the file from the start if it
	            # reads the stream itself (its internals are not visible here).
	            uploaded_file.seek(0)

	            # The pipeline's result is used as the revised document text.
	            revised_text = apply_pipeline(
	                uploaded_file,
	                model_name,
	                balance_type,
	                apsn_transactions,
	                max_fees_per_day,
	                min_overdrawn_fee,
	                min_transaction_overdraft
	            )

	            # Keep all intermediate .docx files in one temporary directory
	            # that is removed on exit, instead of leaving three orphaned
	            # temp files behind on every run.
	            with tempfile.TemporaryDirectory() as work_dir:
	                original_word_path = os.path.join(work_dir, "original.docx")
	                revised_word_path = os.path.join(work_dir, "revised.docx")
	                redlined_output_path = os.path.join(work_dir, "redlined.docx")

	                text_to_word_with_formatting(extracted_text, original_word_path)
	                text_to_word_with_formatting(revised_text, revised_word_path)
	                redline_changes(original_word_path, revised_word_path, redlined_output_path)

	                # Load the result into memory before the directory vanishes.
	                with open(redlined_output_path, "rb") as f:
	                    redlined_bytes = f.read()

	            st.download_button(
	                label="Download Redlined Document",
	                data=redlined_bytes,
	                file_name="redlined_document.docx",
	                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	            )
	            st.success("Redlined document created successfully!")

	        except Exception as e:
	            # Top-level UI boundary: show the failure in the page rather
	            # than crashing the app.
	            st.exception(e)