Spaces:

hymarog1
/

Legal_Doc

No application file

Legal_Doc / upload.py

Hyma Roshini Gompa

Initial commit for Streamlit app

52742d2 3 months ago

12.1 kB

	import streamlit as st
	import shelve
	import docx2txt
	import PyPDF2
	import time # Used to simulate typing effect
	import nltk

	import re
	import os
	import requests
	from dotenv import load_dotenv


	import torch
	from sentence_transformers import SentenceTransformer, util
	import nltk

	nltk.download('punkt')
	import hashlib

	from nltk import sent_tokenize
	nltk.download('punkt')

	nltk.download('punkt_tab')

	from transformers import LEDTokenizer, LEDForConditionalGeneration
	import torch

	# Page-level configuration: browser title and wide layout.
	st.set_page_config(page_title="Legal Document Summarizer", layout="wide")

	# Heading rendered at the top of the app.
	st.title("📄 Legal Document Summarizer (Upload)")

	# Avatars passed to st.chat_message: USER_AVATAR for messages whose role is
	# "user", BOT_AVATAR for every other role.
	USER_AVATAR = "👤"
	BOT_AVATAR = "🤖"

	# Load chat history
	def load_chat_history():
	    """Return the message list persisted in the ``chat_history`` shelf.

	    Looks up the ``"messages"`` key and returns its value, or an empty
	    list when the key is absent. The shelf is closed before returning.
	    """
	    store = shelve.open("chat_history")
	    try:
	        history = store.get("messages", [])
	    finally:
	        store.close()
	    return history

	# Save chat history
	def save_chat_history(messages):
	    """Store *messages* under the ``"messages"`` key of the ``chat_history`` shelf.

	    Any previously stored value for that key is overwritten. The shelf is
	    closed before returning.
	    """
	    store = shelve.open("chat_history")
	    try:
	        store["messages"] = messages
	    finally:
	        store.close()

	# Function to limit text preview to 500 words
	def limit_text(text, word_limit=500):
	    """Return a preview made of the first *word_limit* words of *text*.

	    Words are split on whitespace and re-joined with single spaces, so
	    the original spacing is not preserved. ``"..."`` is appended when
	    the text contains more than *word_limit* words.
	    """
	    tokens = text.split()
	    preview = " ".join(tokens[:word_limit])
	    if len(tokens) > word_limit:
	        preview += "..."
	    return preview


	# CLEAN AND NORMALIZE TEXT


	def clean_text(text):
	    """Normalize extracted document text into one clean line.

	    Removes "Page X of Y" markers (case-insensitive), runs of five or
	    more underscores or hyphens, and runs of four or more dots. Then
	    collapses every whitespace run (including newlines) to a single
	    space and strips leading/trailing whitespace.

	    Args:
	        text: Raw text to clean.

	    Returns:
	        The cleaned, single-line string.
	    """
	    # Strip layout artefacts first.
	    text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
	    text = re.sub(r'_{5,}', '', text)   # Signature / form lines: _____
	    text = re.sub(r'-{5,}', '', text)   # Horizontal rules: -----
	    text = re.sub(r'\.{4,}', '', text)  # Dot leaders: "......"

	    # Collapse whitespace last: removing a marker that sat between two
	    # spaces would otherwise leave a double space in the result.
	    text = re.sub(r'\s+', ' ', text)

	    return text.strip()


	#######################################################################################################################


	# LOADING MODELS FOR DIVIDING TEXT INTO SECTIONS

	# Load variables from the .env file, then read the Hugging Face API token
	# from the environment. classify_zero_shot_hfapi returns an error string
	# when this value is missing or empty.
	load_dotenv()
	HF_API_TOKEN = os.getenv("HF_API_TOKEN")


	def classify_zero_shot_hfapi(text, labels):
	    """Classify *text* against *labels* with the hosted zero-shot model.

	    Sends one POST request to the Hugging Face Inference API endpoint for
	    ``valhalla/distilbart-mnli-12-1`` and returns the first entry of the
	    ``"labels"`` list in the JSON response.

	    Failures never raise. They are reported as a string starting with
	    "❌", which callers treat like any other unrecognised label.

	    Args:
	        text: The text to classify.
	        labels: Candidate labels forwarded as ``candidate_labels``.

	    Returns:
	        The top label, or an "❌ ..." error description.
	    """
	    if not HF_API_TOKEN:
	        return "❌ Hugging Face token not found."

	    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
	    payload = {
	        "inputs": text,
	        "parameters": {"candidate_labels": labels},
	    }

	    try:
	        response = requests.post(
	            "https://api-inference.huggingface.co/models/valhalla/distilbart-mnli-12-1",
	            headers=headers,
	            json=payload,
	            # Without a timeout a stalled connection would block the app
	            # indefinitely; this function is called once per text chunk.
	            timeout=30,
	        )
	    except requests.RequestException as exc:
	        return f"❌ Error contacting HF API: {exc}"

	    if response.status_code != 200:
	        return f"❌ Error from HF API: {response.status_code} - {response.text}"

	    try:
	        result = response.json()
	    except ValueError:
	        return "❌ Error from HF API: response was not valid JSON."

	    # Check the payload shape instead of letting a KeyError/TypeError
	    # escape when the response is not the expected {"labels": [...]} dict.
	    top_labels = result.get("labels") if isinstance(result, dict) else None
	    if not top_labels:
	        return f"❌ Unexpected response from HF API: {result!r}"

	    return top_labels[0]  # Return the top label


	# Candidate labels that classify_chunk passes to the zero-shot classifier.
	SECTION_LABELS = ["Facts", "Arguments", "Judgment", "Other"]


	def classify_chunk(text):
	    """Return the section label predicted for *text*.

	    Delegates to classify_zero_shot_hfapi with SECTION_LABELS as the
	    candidate labels and returns its result unchanged.
	    """
	    predicted = classify_zero_shot_hfapi(text, SECTION_LABELS)
	    return predicted


	# NEW: NLP-based sectioning using zero-shot classification
	def section_by_zero_shot(text, chunk_size=3):
	    """Split *text* into Facts / Arguments / Judgment / Other sections.

	    The text is sentence-tokenized and grouped into consecutive chunks of
	    *chunk_size* sentences (the last chunk may be shorter). Each chunk is
	    labelled by classify_chunk and appended to the matching section. A
	    label that is not a known section name after ``str.capitalize()``
	    goes to "Other".

	    Args:
	        text: The text to section.
	        chunk_size: Sentences per classified chunk. Defaults to 3.

	    Returns:
	        A dict mapping each section name to its concatenated chunks. Each
	        chunk is followed by a newline; sections with no chunks map to "".

	    Raises:
	        ValueError: If *chunk_size* is smaller than 1.
	    """
	    if chunk_size < 1:
	        raise ValueError("chunk_size must be a positive integer")

	    # Collect chunks per section and join once at the end.
	    collected = {"Facts": [], "Arguments": [], "Judgment": [], "Other": []}
	    sentences = sent_tokenize(text)

	    for start in range(0, len(sentences), chunk_size):
	        # Each sentence keeps a trailing space, so the chunk ends with one.
	        chunk = "".join(sent + " " for sent in sentences[start:start + chunk_size])
	        label = classify_chunk(chunk.strip())
	        print(f"🔎 Chunk: {chunk[:60]}...\n🔖 Predicted Label: {label}")

	        # Normalize the label; anything unrecognised falls back to "Other".
	        label = label.capitalize()
	        if label not in collected:
	            label = "Other"
	        collected[label].append(chunk + "\n")

	    return {name: "".join(parts) for name, parts in collected.items()}

	#######################################################################################################################



	# EXTRACTING TEXT FROM UPLOADED FILES

	# Function to extract text from uploaded file
	def extract_text(file):
	    """Extract the full text of an uploaded PDF, DOCX or TXT file.

	    The type is chosen from the extension of ``file.name``, compared
	    case-insensitively so that e.g. "BRIEF.PDF" is handled like
	    "brief.pdf".

	    Args:
	        file: A file-like object exposing ``name`` and ``read()``.

	    Returns:
	        The extracted text. For any other extension the literal string
	        "Unsupported file type." is returned.
	    """
	    filename = file.name.lower()

	    if filename.endswith(".pdf"):
	        reader = PyPDF2.PdfReader(file)
	        # extract_text() may yield nothing for a page; treat that as "".
	        return "\n".join(page.extract_text() or "" for page in reader.pages)

	    if filename.endswith(".docx"):
	        return docx2txt.process(file)

	    if filename.endswith(".txt"):
	        # Replace undecodable bytes rather than crashing the app on text
	        # files that are not valid UTF-8.
	        return file.read().decode("utf-8", errors="replace")

	    return "Unsupported file type."


	#######################################################################################################################

	# EXTRACTIVE AND ABSTRACTIVE SUMMARIZATION


	@st.cache_resource
	def load_legalbert():
	    """Build the SentenceTransformer for ``nlpaueb/legal-bert-base-uncased``.

	    Wrapped in ``st.cache_resource``. Returns the constructed model.
	    """
	    model_name = "nlpaueb/legal-bert-base-uncased"
	    encoder = SentenceTransformer(model_name)
	    return encoder


	# Module-level encoder used by legalbert_extractive_summary.
	legalbert_model = load_legalbert()

	@st.cache_resource
	def load_led():
	    """Load the LED tokenizer and model from ``allenai/led-base-16384``.

	    Wrapped in ``st.cache_resource``. Returns a ``(tokenizer, model)``
	    tuple.
	    """
	    checkpoint = "allenai/led-base-16384"
	    led_tokenizer = LEDTokenizer.from_pretrained(checkpoint)
	    led_model = LEDForConditionalGeneration.from_pretrained(checkpoint)
	    return led_tokenizer, led_model

	# Module-level tokenizer/model pair used by led_abstractive_summary.
	tokenizer_led, model_led = load_led()


	def legalbert_extractive_summary(text, top_ratio=0.2):
	    """Return the sentences of *text* closest to the document's mean embedding.

	    The number of sentences kept is ``max(3, int(n * top_ratio))`` for a
	    text of ``n`` sentences. When the text has no more sentences than
	    that, it is returned unchanged. Otherwise sentences are embedded with
	    ``legalbert_model``, scored by cosine similarity against the mean of
	    all sentence embeddings, and the top-scoring ones are joined with
	    single spaces in their original order.
	    """
	    sents = sent_tokenize(text)
	    keep = max(3, int(len(sents) * top_ratio))

	    # Nothing to trim: the text is already within the sentence budget.
	    if keep >= len(sents):
	        return text

	    # Score each sentence against the centroid of all sentence vectors.
	    sent_vectors = legalbert_model.encode(sents, convert_to_tensor=True)
	    centroid = torch.mean(sent_vectors, dim=0)
	    similarity = util.pytorch_cos_sim(centroid, sent_vectors)[0]
	    best = torch.topk(similarity, k=keep)

	    # Sort the chosen indices so the summary follows document order.
	    chosen = sorted(best.indices.tolist())
	    return " ".join(sents[idx] for idx in chosen)



	# Add LED Abstractive Summarization


	def led_abstractive_summary(text, max_length=512, min_length=100):
	    """Generate an abstractive summary of *text* with the LED model.

	    The input is tokenized with truncation and ``max_length`` padding at
	    4096 tokens. Global attention is set on the first token only.
	    Generation uses 4 beams, a length penalty of 2.0 and the given
	    *max_length* / *min_length* bounds. Returns the first generated
	    sequence decoded with special tokens skipped.
	    """
	    encoded = tokenizer_led(
	        text,
	        return_tensors="pt",
	        padding="max_length",
	        truncation=True,
	        max_length=4096,
	    )
	    input_ids = encoded["input_ids"]

	    # Global attention mask: zero everywhere except the first token.
	    global_mask = torch.zeros_like(input_ids)
	    global_mask[:, 0] = 1

	    generation_kwargs = {
	        "attention_mask": encoded["attention_mask"],
	        "global_attention_mask": global_mask,
	        "max_length": max_length,
	        "min_length": min_length,
	        "length_penalty": 2.0,
	        "num_beams": 4,
	    }
	    generated = model_led.generate(input_ids, **generation_kwargs)
	    return tokenizer_led.decode(generated[0], skip_special_tokens=True)


	def hybrid_summary_by_section(text, top_ratio=0.8):
	    """Build a per-section extractive + abstractive summary of *text*.

	    The text is cleaned with clean_text and split into sections by
	    section_by_zero_shot. For each non-blank section an extractive
	    summary is produced by legalbert_extractive_summary, then condensed
	    by led_abstractive_summary. Both are emitted under a Markdown heading
	    for the section.

	    Args:
	        text: Raw document text.
	        top_ratio: Fraction of each section's sentences kept by the
	            extractive step. Defaults to 0.8.

	    Returns:
	        The section summaries joined by blank lines, or "" when every
	        section is blank.
	    """
	    cleaned_text = clean_text(text)
	    sections = section_by_zero_shot(cleaned_text)  # Facts, Arguments, Judgment, Other

	    summary_parts = []
	    for name, content in sections.items():
	        if not content.strip():
	            continue

	        # Honour the caller's ratio instead of a hard-coded 0.8.
	        extractive = legalbert_extractive_summary(content, top_ratio)

	        # The abstractive pass works on the extractive output.
	        abstractive = led_abstractive_summary(extractive)

	        hybrid = f"📌 Extractive Summary:\n{extractive}\n\n🔍 Abstractive Summary:\n{abstractive}"
	        # clean_text flattens the newlines inside `hybrid`, so each
	        # section body is a single line under its heading.
	        summary_parts.append(f"### 📘 {name} Section:\n{clean_text(hybrid)}")

	    return "\n\n".join(summary_parts)


	#######################################################################################################################


	# STREAMLIT APP INTERFACE CODE

	# Seed session state: chat messages come from the persisted history.
	if "messages" not in st.session_state:
	    st.session_state["messages"] = load_chat_history()

	# NOTE(review): the upload handler in this file reads and writes
	# "last_uploaded_hash"; "last_uploaded" is only ever set to None.
	if "last_uploaded" not in st.session_state:
	    st.session_state["last_uploaded"] = None

	# Sidebar: one control that wipes the conversation.
	with st.sidebar:
	    st.subheader("⚙️ Options")
	    clear_requested = st.button("Delete Chat History")
	    if clear_requested:
	        # Reset the in-memory state, then overwrite the persisted history.
	        st.session_state.messages = []
	        st.session_state.last_uploaded = None
	        save_chat_history([])

	# Display chat messages with a typing effect
	def display_with_typing_effect(text, speed=0.005):
	    """Render *text* into one placeholder, one character at a time.

	    After each character is added the placeholder is re-rendered as
	    Markdown and the function sleeps for *speed* seconds. Returns the
	    text that was displayed.
	    """
	    slot = st.empty()
	    typed = []
	    shown = ""
	    for ch in text:
	        typed.append(ch)
	        shown = "".join(typed)
	        slot.markdown(shown)
	        time.sleep(speed)
	    return shown

	# Replay the stored conversation, one chat bubble per message.
	for message in st.session_state.messages:
	    role = message["role"]
	    avatar = BOT_AVATAR if role != "user" else USER_AVATAR
	    with st.chat_message(role, avatar=avatar):
	        st.markdown(message["content"])


	# Standard chat input field. `prompt` is tested for truthiness further
	# down before its text is summarized.
	prompt = st.chat_input("Type a message...")

	# # Place file uploader AFTER the chat input to keep layout consistent
	# uploaded_file = st.file_uploader("📎 Upload a file (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"])

	# Upload area: heading, file picker (PDF / DOCX / TXT) and a button that
	# forces the uploaded file to be summarized again.
	upload_area = st.container()
	with upload_area:
	    st.subheader("📎 Upload a Legal Document")
	    uploaded_file = st.file_uploader(
	        "Upload a file (PDF, DOCX, TXT)",
	        type=["pdf", "docx", "txt"],
	    )
	    reprocess_btn = st.button("🔄 Reprocess Last Uploaded File")


	# Hashing logic
	def get_file_hash(file):
	    """Return the hex MD5 digest of the full content of *file*.

	    The file is rewound to the start before reading and again afterwards,
	    so the caller can read it from the beginning.
	    """
	    digest = hashlib.md5()
	    file.seek(0)
	    digest.update(file.read())
	    file.seek(0)  # Leave the stream rewound for the next reader.
	    return digest.hexdigest()

	# # Handle file upload and generate hybrid summary
	# if uploaded_file:
	# file_hash = get_file_hash(uploaded_file)

	# # Check if this file is already uploaded
	# if file_hash != st.session_state.get("last_uploaded_hash"):
	# raw_text = extract_text(uploaded_file)
	# summary_text = hybrid_summary_by_section(raw_text)

	# st.session_state.messages.append({
	# "role": "user",
	# "content": f"📤 Uploaded {uploaded_file.name}"
	# })

	# with st.chat_message("assistant", avatar=BOT_AVATAR):
	# preview_text = f"🧾 Hybrid Summary of {uploaded_file.name}:\n\n{summary_text}"
	# display_with_typing_effect(clean_text(preview_text), speed=0.000001)

	# st.session_state.messages.append({
	# "role": "assistant",
	# "content": preview_text
	# })

	# st.session_state.last_uploaded_hash = file_hash
	# save_chat_history(st.session_state.messages)

	# # Force rerun to reset uploader state & redraw layout properly
	# st.rerun()



	if uploaded_file:
	    file_hash = get_file_hash(uploaded_file)
	    is_new_file = file_hash != st.session_state.get("last_uploaded_hash")

	    # Summarize when the content differs from the last processed upload,
	    # or when the reprocess button was pressed.
	    if is_new_file or reprocess_btn:
	        raw_text = extract_text(uploaded_file)
	        summary_text = hybrid_summary_by_section(raw_text)

	        user_entry = {
	            "role": "user",
	            "content": f"📤 Uploaded {uploaded_file.name}",
	        }
	        st.session_state.messages.append(user_entry)

	        preview_text = f"🧾 Hybrid Summary of {uploaded_file.name}:\n\n{summary_text}"
	        with st.chat_message("assistant", avatar=BOT_AVATAR):
	            display_with_typing_effect(clean_text(preview_text), speed=0.000001)

	        assistant_entry = {
	            "role": "assistant",
	            "content": preview_text,
	        }
	        st.session_state.messages.append(assistant_entry)

	        # Record the hash only for a regular upload, not for a reprocess.
	        if not reprocess_btn:
	            st.session_state.last_uploaded_hash = file_hash

	        # Persist the history, then rerun so it is redrawn from state.
	        save_chat_history(st.session_state.messages)
	        st.rerun()


	# Typed text is summarized exactly like an uploaded document.
	if prompt:
	    raw_text = prompt
	    summary_text = hybrid_summary_by_section(raw_text)

	    user_entry = {
	        "role": "user",
	        "content": prompt,
	    }
	    st.session_state.messages.append(user_entry)

	    bot_response = f"📝 Hybrid Summary of your text:\n\n{summary_text}"
	    with st.chat_message("assistant", avatar=BOT_AVATAR):
	        display_with_typing_effect(clean_text(bot_response), speed=0.000005)

	    assistant_entry = {
	        "role": "assistant",
	        "content": bot_response,
	    }
	    st.session_state.messages.append(assistant_entry)

	    # Persist the history, then rerun so it is redrawn from state.
	    save_chat_history(st.session_state.messages)
	    st.rerun()