# Spaces: Sleeping  (page-header residue from the web capture; not part of the program)
import os
import re
import tempfile
from difflib import unified_diff

import fitz  # PyMuPDF
import streamlit as st
from docx import Document
from docx.shared import RGBColor

from langchain_pipeline import pipeline, model_names
def pdf_to_text_with_layout(pdf_file):
    """Extract the text of every page of a PDF and join the pages with newlines.

    Despite the name, extraction uses PyMuPDF's plain ``"text"`` mode, so the
    result is a flat string rather than a layout-aware structure.

    Args:
        pdf_file: A binary file-like object. It is read with ``.read()``, so
            its stream position is left wherever that read ends; callers that
            need the content again must rewind it themselves.

    Returns:
        The text of all pages, one page after another, separated by ``"\\n"``.
    """
    # Open as a context manager so the document is closed (and its buffers
    # released) even if text extraction raises part-way through.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)
def clean_text(text):
    """Return *text* with non-printable and non-BMP characters removed.

    A character is kept only when ``str.isprintable()`` is true for it and its
    code point is below 65536. Printable non-ASCII characters are therefore
    preserved, while control characters (including tabs and newlines) and
    characters at or above U+10000 are dropped.
    """
    kept = []
    for ch in text:
        if ord(ch) >= 65536:
            continue
        if not ch.isprintable():
            continue
        kept.append(ch)
    return ''.join(kept)
def text_to_word_with_formatting(text, word_path):
    """Save *text* as a Word document, one paragraph per ``"\\n"``-separated line.

    Every line is passed through ``clean_text`` before it is added. No
    explicit styling is applied to the paragraphs. The finished document is
    written to *word_path*.
    """
    document = Document()
    for raw_line in text.split("\n"):
        document.add_paragraph(clean_text(raw_line))
    document.save(word_path)
def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
    """Call ``pipeline`` with the given arguments and return its result.

    All seven arguments are forwarded positionally, in the order they appear
    in this function's signature.
    """
    pipeline_args = (
        file,
        model_name,
        balance_type,
        apsn_transactions,
        max_fees_per_day,
        min_overdrawn_fee,
        min_transaction_overdraft,
    )
    return pipeline(*pipeline_args)
def redline_changes(original_path, revised_path, output_path):
    """Write a colour-coded unified diff of two Word documents to a new .docx.

    The paragraph texts of each input document are joined with newlines, split
    back into lines and compared with ``difflib.unified_diff``. Every diff
    line becomes one paragraph in the output document:

    * removed lines (``-``) are red,
    * added lines (``+``) are green,
    * hunk headers (``@@``) are blue,
    * context lines and the two leading file-header lines are left uncoloured.

    Args:
        original_path: Path of the original .docx file.
        revised_path: Path of the revised .docx file.
        output_path: Path the redlined .docx is saved to.
    """
    original_doc = Document(original_path)
    revised_doc = Document(revised_path)
    original_text = "\n".join(para.text for para in original_doc.paragraphs)
    revised_text = "\n".join(para.text for para in revised_doc.paragraphs)
    diff = unified_diff(original_text.splitlines(), revised_text.splitlines(), lineterm='')

    prefix_colors = (
        ('-', RGBColor(255, 0, 0)),    # Red: removed line
        ('+', RGBColor(0, 128, 0)),    # Green: added line
        ('@@', RGBColor(0, 0, 255)),   # Blue: hunk header
    )

    diff_doc = Document()
    for index, line in enumerate(diff):
        color = None
        # unified_diff emits the '--- ' / '+++ ' file headers as its first two
        # lines. They are metadata, not changes, so they must not be coloured
        # as a removal/addition even though they start with '-' / '+'.
        if index >= 2:
            color = next(
                (rgb for prefix, rgb in prefix_colors if line.startswith(prefix)),
                None,
            )
        if color is None:
            diff_doc.add_paragraph(line, style='Normal')
        else:
            run = diff_doc.add_paragraph(style='Normal').add_run(line)
            run.font.color.rgb = color
    diff_doc.save(output_path)
# Streamlit App
#
# Collects the overdraft-fee settings and a PDF upload, runs the pipeline on
# the upload, and offers a redlined .docx comparing the PDF's extracted text
# with the pipeline output.
st.title("Canarie AI Prototype")
st.subheader("Finding the canarie in the coal mine")

model_name = st.selectbox("Model", model_names())
balance_type = st.selectbox("Do you charge on available balance or ledger balance?", ["available balance", "ledger balance"])
apsn_transactions = st.selectbox("Do you charge for APSN transactions?", ["yes", "no"])
max_fees_per_day = st.number_input("How many overdraft fees per day can be charged?", min_value=0, max_value=10)
min_overdrawn_fee = st.number_input("What is the minimum amount overdrawn to incur a fee?", min_value=0, max_value=500)
min_transaction_overdraft = st.number_input("What is the minimum transaction amount to trigger an overdraft?", min_value=0, max_value=500)

uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

if uploaded_file is not None:
    with st.spinner('Please wait ...'):
        try:
            # Extract text with layout preservation
            extracted_text = pdf_to_text_with_layout(uploaded_file)
            # The extraction above read the upload to EOF; rewind it so the
            # pipeline receives the whole PDF rather than an exhausted stream.
            uploaded_file.seek(0)

            # Keep all intermediate .docx files in one temporary directory that
            # is removed on exit, instead of leaking delete=False temp files
            # (and their open handles) on every rerun.
            with tempfile.TemporaryDirectory() as work_dir:
                original_word_path = os.path.join(work_dir, "original.docx")
                revised_word_path = os.path.join(work_dir, "revised.docx")
                redlined_output_path = os.path.join(work_dir, "redlined.docx")

                text_to_word_with_formatting(extracted_text, original_word_path)

                diff = apply_pipeline(
                    uploaded_file,
                    model_name,
                    balance_type,
                    apsn_transactions,
                    max_fees_per_day,
                    min_overdrawn_fee,
                    min_transaction_overdraft
                )
                text_to_word_with_formatting(diff, revised_word_path)

                redline_changes(original_word_path, revised_word_path, redlined_output_path)

                # Load the result into memory before the directory is deleted.
                with open(redlined_output_path, "rb") as f:
                    redlined_bytes = f.read()

            st.download_button(
                label="Download Redlined Document",
                data=redlined_bytes,
                file_name="redlined_document.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
            st.success("Redlined document created successfully!")
        except Exception as e:
            # Top-level UI boundary: show any failure in the page.
            st.exception(e)