"""Streamlit app: upload a legal document (PDF/DOCX/TXT) or paste text and
receive a hybrid (extractive Legal-BERT + abstractive LED) summary, split
into Facts / Arguments / Judgment / Other sections via zero-shot
classification on the Hugging Face Inference API."""

import hashlib
import os
import re
import shelve
import time  # used to simulate a typing effect

import docx2txt
import nltk
import PyPDF2
import requests
import streamlit as st
import torch
from dotenv import load_dotenv
from nltk import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import LEDTokenizer, LEDForConditionalGeneration

# Tokenizer data required by nltk.sent_tokenize (cached locally by nltk).
nltk.download('punkt')
nltk.download('punkt_tab')

st.set_page_config(page_title="Legal Document Summarizer", layout="wide")
st.title("📄 Legal Document Summarizer (Upload)")

USER_AVATAR = "👤"
BOT_AVATAR = "🤖"


def load_chat_history():
    """Return the persisted chat history, or an empty list if none exists."""
    with shelve.open("chat_history") as db:
        return db.get("messages", [])


def save_chat_history(messages):
    """Persist *messages* to the local shelve database."""
    with shelve.open("chat_history") as db:
        db["messages"] = messages


def limit_text(text, word_limit=500):
    """Return at most *word_limit* words of *text*, appending '...' if truncated."""
    words = text.split()
    return " ".join(words[:word_limit]) + ("..." if len(words) > word_limit else "")


def clean_text(text):
    """Normalize whitespace and strip common layout artifacts from *text*.

    Removes newlines, repeated spaces, "Page X of Y" markers, and long
    underscore/hyphen/dot separator runs typical of scanned legal documents.
    """
    # Collapse newlines and runs of whitespace into single spaces.
    text = text.replace('\r\n', ' ').replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    # Remove page number markers like "Page 1 of 10".
    text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
    # Remove long dashed or underscored separator lines.
    text = re.sub(r'[_]{5,}', '', text)   # underscores: _____
    text = re.sub(r'[-]{5,}', '', text)   # hyphens: -----
    # Remove long dotted separators like "......".
    text = re.sub(r'[.]{4,}', '', text)
    return text.strip()


#######################################################################################################################
# LOADING MODELS FOR DIVIDING TEXT INTO SECTIONS

# Load the Hugging Face API token from the .env file.
load_dotenv()
HF_API_TOKEN = os.getenv("HF_API_TOKEN")


def classify_zero_shot_hfapi(text, labels):
    """Zero-shot classify *text* against *labels* via the HF Inference API.

    Returns the top-ranked label on success, or an error string prefixed
    with '❌' on any failure (missing token, HTTP error, malformed payload).
    """
    if not HF_API_TOKEN:
        return "❌ Hugging Face token not found."

    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    payload = {
        "inputs": text,
        "parameters": {"candidate_labels": labels},
    }
    response = requests.post(
        "https://api-inference.huggingface.co/models/valhalla/distilbart-mnli-12-1",
        headers=headers,
        json=payload,
        timeout=60,  # never hang the app on a stalled API call
    )
    if response.status_code != 200:
        return f"❌ Error from HF API: {response.status_code} - {response.text}"

    result = response.json()
    # A 200 response can still carry an error payload (e.g. model loading),
    # in which case "labels" is absent — guard against a KeyError.
    if not isinstance(result, dict) or "labels" not in result:
        return f"❌ Unexpected HF API response: {result}"
    return result["labels"][0]  # top-ranked label


# Labels for section classification.
SECTION_LABELS = ["Facts", "Arguments", "Judgment", "Other"]


def classify_chunk(text):
    """Classify one text chunk into a section label (see SECTION_LABELS)."""
    return classify_zero_shot_hfapi(text, SECTION_LABELS)


def section_by_zero_shot(text):
    """Split *text* into labeled sections using zero-shot classification.

    Sentences are grouped into chunks of three; each chunk is classified
    and appended to the matching section. Unknown labels fall back to
    "Other". Returns a dict mapping section name -> accumulated text.
    """
    sections = {"Facts": "", "Arguments": "", "Judgment": "", "Other": ""}
    sentences = sent_tokenize(text)
    chunk = ""
    for i, sent in enumerate(sentences):
        chunk += sent + " "
        # Classify every 3 sentences (or the trailing remainder).
        if (i + 1) % 3 == 0 or i == len(sentences) - 1:
            label = classify_chunk(chunk.strip())
            print(f"🔎 Chunk: {chunk[:60]}...\n🔖 Predicted Label: {label}")
            # Normalize label casing and fall back to "Other" for anything
            # unexpected (including API error strings).
            label = label.capitalize()
            if label not in sections:
                label = "Other"
            sections[label] += chunk + "\n"
            chunk = ""
    return sections


#######################################################################################################################
# EXTRACTING TEXT FROM UPLOADED FILES


def extract_text(file):
    """Extract the full plain text from an uploaded PDF, DOCX or TXT file.

    Returns the string "Unsupported file type." for any other extension.
    """
    if file.name.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file)
        # extract_text() may return None for image-only pages — coalesce to "".
        full_text = "\n".join(page.extract_text() or "" for page in reader.pages)
    elif file.name.endswith(".docx"):
        full_text = docx2txt.process(file)
    elif file.name.endswith(".txt"):
        full_text = file.read().decode("utf-8")
    else:
        return "Unsupported file type."
    return full_text  # full text is needed for summarization


#######################################################################################################################
# EXTRACTIVE AND ABSTRACTIVE SUMMARIZATION


@st.cache_resource
def load_legalbert():
    """Load (and cache) the Legal-BERT sentence-embedding model."""
    return SentenceTransformer("nlpaueb/legal-bert-base-uncased")


legalbert_model = load_legalbert()


@st.cache_resource
def load_led():
    """Load (and cache) the LED long-document summarization model."""
    tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
    model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")
    return tokenizer, model


tokenizer_led, model_led = load_led()


def legalbert_extractive_summary(text, top_ratio=0.2):
    """Extractive summary: keep the *top_ratio* sentences most similar to
    the document centroid (minimum 3), preserving original order."""
    sentences = sent_tokenize(text)
    top_k = max(3, int(len(sentences) * top_ratio))
    if len(sentences) <= top_k:
        return text
    # Score each sentence by cosine similarity to the mean document embedding.
    sentence_embeddings = legalbert_model.encode(sentences, convert_to_tensor=True)
    doc_embedding = torch.mean(sentence_embeddings, dim=0)
    cosine_scores = util.pytorch_cos_sim(doc_embedding, sentence_embeddings)[0]
    top_results = torch.topk(cosine_scores, k=top_k)
    # Re-sort selected indices so sentences appear in original order.
    selected_sentences = [sentences[i] for i in sorted(top_results.indices.tolist())]
    return " ".join(selected_sentences)


def led_abstractive_summary(text, max_length=512, min_length=100):
    """Abstractive summary of *text* with LED (handles inputs up to 4096 tokens)."""
    inputs = tokenizer_led(
        text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=4096,
    )
    # LED needs global attention on at least the first token.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    outputs = model_led.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        global_attention_mask=global_attention_mask,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
    )
    return tokenizer_led.decode(outputs[0], skip_special_tokens=True)


def hybrid_summary_by_section(text, top_ratio=0.8):
    """Produce a per-section hybrid summary of *text*.

    The text is cleaned, split into sections by zero-shot classification,
    then each non-empty section gets an extractive (Legal-BERT, keeping
    *top_ratio* of sentences) and an abstractive (LED) summary.
    """
    cleaned_text = clean_text(text)
    # Split into Facts, Arguments, Judgment, Other.
    sections = section_by_zero_shot(cleaned_text)
    summary_parts = []
    for name, content in sections.items():
        if not content.strip():
            continue
        # Extractive summary using Legal-BERT (top_ratio is honored here —
        # previously it was hard-coded and the parameter was ignored).
        extractive = legalbert_extractive_summary(content, top_ratio)
        # Abstractive summary using LED (handles long input).
        abstractive = led_abstractive_summary(extractive)
        hybrid = (
            f"📌 **Extractive Summary:**\n{extractive}\n\n"
            f"🔍 **Abstractive Summary:**\n{abstractive}"
        )
        summary_parts.append(f"### 📘 {name} Section:\n{clean_text(hybrid)}")
    return "\n\n".join(summary_parts)


#######################################################################################################################
# STREAMLIT APP INTERFACE CODE

# Initialize or load chat history.
if "messages" not in st.session_state:
    st.session_state.messages = load_chat_history()

# Initialize last_uploaded if not set.
if "last_uploaded" not in st.session_state:
    st.session_state.last_uploaded = None

# Sidebar with a button to delete chat history.
with st.sidebar:
    st.subheader("⚙️ Options")
    if st.button("Delete Chat History"):
        st.session_state.messages = []
        st.session_state.last_uploaded = None
        save_chat_history([])


def display_with_typing_effect(text, speed=0.005):
    """Render *text* character by character to simulate typing."""
    placeholder = st.empty()
    displayed_text = ""
    for char in text:
        displayed_text += char
        placeholder.markdown(displayed_text)
        time.sleep(speed)
    return displayed_text


# Show existing chat messages.
for message in st.session_state.messages:
    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
    with st.chat_message(message["role"], avatar=avatar):
        st.markdown(message["content"])

# Standard chat input field.
prompt = st.chat_input("Type a message...")

# Place the uploader before the chat so it's always visible.
with st.container():
    st.subheader("📎 Upload a Legal Document")
    uploaded_file = st.file_uploader(
        "Upload a file (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"]
    )
    reprocess_btn = st.button("🔄 Reprocess Last Uploaded File")


def get_file_hash(file):
    """Return the MD5 hex digest of *file*'s contents (used only to detect
    re-uploads of the same file, not for security)."""
    file.seek(0)
    content = file.read()
    file.seek(0)
    return hashlib.md5(content).hexdigest()


# Handle file upload: summarize when the file is new OR reprocess is clicked.
if uploaded_file:
    file_hash = get_file_hash(uploaded_file)
    if file_hash != st.session_state.get("last_uploaded_hash") or reprocess_btn:
        raw_text = extract_text(uploaded_file)
        summary_text = hybrid_summary_by_section(raw_text)

        st.session_state.messages.append({
            "role": "user",
            "content": f"📤 Uploaded **{uploaded_file.name}**",
        })

        with st.chat_message("assistant", avatar=BOT_AVATAR):
            preview_text = (
                f"🧾 **Hybrid Summary of {uploaded_file.name}:**\n\n{summary_text}"
            )
            display_with_typing_effect(clean_text(preview_text), speed=0.000001)

        st.session_state.messages.append({
            "role": "assistant",
            "content": preview_text,
        })

        # Save this file hash only for a new upload, so a manual reprocess
        # doesn't suppress detection of the next genuinely new file.
        if not reprocess_btn:
            st.session_state.last_uploaded_hash = file_hash

        save_chat_history(st.session_state.messages)
        st.rerun()

# Handle chat input: summarize pasted text the same way.
if prompt:
    summary_text = hybrid_summary_by_section(prompt)

    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
    })

    with st.chat_message("assistant", avatar=BOT_AVATAR):
        bot_response = f"📝 **Hybrid Summary of your text:**\n\n{summary_text}"
        display_with_typing_effect(clean_text(bot_response), speed=0.000005)

    st.session_state.messages.append({
        "role": "assistant",
        "content": bot_response,
    })

    save_chat_history(st.session_state.messages)
    st.rerun()