Spaces:

hymarog1
/

Legal_Doc

No application file

File size: 12,113 Bytes

52742d2

import streamlit as st
import shelve
import docx2txt
import PyPDF2
import time  # Used to simulate typing effect
import nltk

import re
import os
import requests
from dotenv import load_dotenv


import torch
from sentence_transformers import SentenceTransformer, util
import nltk

nltk.download('punkt')
import hashlib

from nltk import sent_tokenize
nltk.download('punkt')

nltk.download('punkt_tab')

from transformers import LEDTokenizer, LEDForConditionalGeneration
import torch

# Configure the browser tab title and use the wide page layout.
st.set_page_config(page_title="Legal Document Summarizer", layout="wide")

# Main heading rendered at the top of the page.
st.title("📄 Legal Document Summarizer (Upload)")

# Avatars passed to st.chat_message for the user and assistant roles.
USER_AVATAR = "👤"
BOT_AVATAR = "🤖"

# Load chat history
def load_chat_history():
    """Return the message list persisted in the ``chat_history`` shelf.

    Returns:
        The value stored under the ``"messages"`` key, or an empty list when
        the shelf has no such key yet.
    """
    with shelve.open("chat_history") as store:
        history = store.get("messages", [])
    return history

# Save chat history
def save_chat_history(messages):
    """Persist *messages* under the ``"messages"`` key of the ``chat_history`` shelf.

    Any previously stored list is overwritten.
    """
    store = shelve.open("chat_history")
    with store:
        store["messages"] = messages

# Function to limit text preview to 500 words
def limit_text(text, word_limit=500):
    """Return a preview of *text* containing at most *word_limit* words.

    Words are whitespace-delimited and re-joined with single spaces, so the
    original spacing is not preserved. An ellipsis (``"..."``) is appended
    only when the text was actually truncated.
    """
    tokens = text.split()
    preview = " ".join(tokens[:word_limit])
    if len(tokens) > word_limit:
        preview += "..."
    return preview


# CLEAN AND NORMALIZE TEXT


def clean_text(text):
    """Strip layout artefacts from *text* and normalize its whitespace.

    Removes, in order:
      * page markers such as ``"Page 1 of 10"`` (case-insensitive),
      * runs of five or more underscores or hyphens (ruled lines),
      * runs of four or more dots (dotted leaders; a normal ``...`` survives).

    Whitespace (including newlines) is collapsed to single spaces *after* the
    removals, so deleting a marker that sat between two words does not leave
    a double space behind.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-line string with no leading/trailing whitespace.
    """
    # Remove page number markers like "Page 1 of 10"
    text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)

    # Remove long underscored / dashed separator lines
    text = re.sub(r'[_]{5,}', '', text)
    text = re.sub(r'[-]{5,}', '', text)

    # Remove long dotted separators such as "......" (four or more dots)
    text = re.sub(r'[.]{4,}', '', text)

    # Collapse every whitespace run (newlines included) last, so gaps opened
    # up by the removals above are normalized too.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


#######################################################################################################################


# LOADING MODELS FOR DIVIDING TEXT INTO SECTIONS

# Load variables from a .env file, then read the Hugging Face API token.
# HF_API_TOKEN is None when the variable is not set; classify_zero_shot_hfapi
# checks for that and returns an error string instead of calling the API.
load_dotenv()
HF_API_TOKEN = os.getenv("HF_API_TOKEN")


def classify_zero_shot_hfapi(text, labels):
    """Classify *text* against *labels* via the Hugging Face Inference API.

    Sends a zero-shot classification request to the hosted
    ``valhalla/distilbart-mnli-12-1`` model.

    Args:
        text: The text to classify.
        labels: Candidate label strings.

    Returns:
        The first entry of the response's ``labels`` list on success.
        On any failure (missing token, network error, non-200 status,
        unexpected response body) a human-readable string starting with
        ``"❌"`` is returned instead; this function does not raise for those
        cases, so callers must treat unknown strings as "no label".
    """
    if not HF_API_TOKEN:
        return "❌ Hugging Face token not found."

    headers = {
        "Authorization": f"Bearer {HF_API_TOKEN}"
    }

    payload = {
        "inputs": text,
        "parameters": {
            "candidate_labels": labels
        }
    }

    try:
        response = requests.post(
            "https://api-inference.huggingface.co/models/valhalla/distilbart-mnli-12-1",
            headers=headers,
            json=payload,
            # Without a timeout a stalled connection would block the app forever.
            timeout=30,
        )
    except requests.RequestException as exc:
        # Connection errors / timeouts are reported in the same style as
        # HTTP errors rather than crashing the Streamlit script.
        return f"❌ Error contacting HF API: {exc}"

    if response.status_code != 200:
        return f"❌ Error from HF API: {response.status_code} - {response.text}"

    try:
        result = response.json()
        return result["labels"][0]  # Return the top label
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        # ValueError: body is not JSON; the others: JSON without a usable
        # "labels" list (e.g. an error object or a list payload).
        return f"❌ Unexpected HF API response: {exc}"


# Candidate labels sent to the zero-shot classifier. section_by_zero_shot uses
# the same four names as its bucket keys and maps anything else to "Other".
SECTION_LABELS = ["Facts", "Arguments", "Judgment", "Other"]


def classify_chunk(text):
    """Return the section label predicted for *text*.

    Delegates to classify_zero_shot_hfapi with SECTION_LABELS as candidates;
    the return value is whatever that function returns (a label or one of its
    error strings).
    """
    predicted_label = classify_zero_shot_hfapi(text, SECTION_LABELS)
    return predicted_label


# NEW: NLP-based sectioning using zero-shot classification
def section_by_zero_shot(text):
    """Split *text* into Facts / Arguments / Judgment / Other buckets.

    The text is sentence-tokenized and walked in consecutive groups of up to
    three sentences. Each group is labelled with classify_chunk and appended
    (followed by a newline) to the matching bucket. Labels that are not one
    of the four bucket names after ``str.capitalize()`` fall back to "Other".

    Returns:
        dict mapping each of the four section names to its accumulated text
        (an empty string when no group was assigned to it).
    """
    buckets = {name: "" for name in ("Facts", "Arguments", "Judgment", "Other")}
    all_sentences = sent_tokenize(text)

    for start in range(0, len(all_sentences), 3):
        # Each sentence is followed by a single space, including the last one.
        group = "".join(f"{sentence} " for sentence in all_sentences[start:start + 3])

        predicted = classify_chunk(group.strip())
        print(f"🔎 Chunk: {group[:60]}...\n🔖 Predicted Label: {predicted}")

        # Normalize casing, then route unknown labels to the catch-all bucket.
        bucket = predicted.capitalize()
        if bucket not in buckets:
            bucket = "Other"
        buckets[bucket] += group + "\n"

    return buckets

#######################################################################################################################



# EXTRACTING TEXT FROM UPLOADED FILES

# Function to extract text from uploaded file
def extract_text(file):
    """Extract the full text of an uploaded PDF, DOCX or TXT file.

    The file type is chosen from the extension of ``file.name``, compared
    case-insensitively so that e.g. ``REPORT.PDF`` is handled like
    ``report.pdf``.

    Args:
        file: A binary file-like object with a ``name`` attribute.

    Returns:
        The extracted text, or the literal string ``"Unsupported file type."``
        when the extension is not one of ``.pdf``, ``.docx``, ``.txt``.
    """
    filename = file.name.lower()

    if filename.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file)
        # extract_text() may yield nothing for a page; treat that as empty.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if filename.endswith(".docx"):
        return docx2txt.process(file)

    if filename.endswith(".txt"):
        # "utf-8-sig" drops a leading BOM if present; errors="replace" keeps
        # a non-UTF-8 upload from raising UnicodeDecodeError.
        return file.read().decode("utf-8-sig", errors="replace")

    return "Unsupported file type."


#######################################################################################################################

# EXTRACTIVE AND ABSTRACTIVE SUMMARIZATION


@st.cache_resource
def load_legalbert():
    """Create the Legal-BERT sentence encoder (wrapped in st.cache_resource)."""
    encoder = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
    return encoder


# Module-level encoder used by legalbert_extractive_summary.
legalbert_model = load_legalbert()

@st.cache_resource
def load_led():
    """Load the LED tokenizer and model (wrapped in st.cache_resource).

    Returns:
        A ``(tokenizer, model)`` tuple, both from ``allenai/led-base-16384``.
    """
    checkpoint = "allenai/led-base-16384"
    led_tokenizer = LEDTokenizer.from_pretrained(checkpoint)
    led_model = LEDForConditionalGeneration.from_pretrained(checkpoint)
    return led_tokenizer, led_model


# Module-level tokenizer/model pair used by led_abstractive_summary.
tokenizer_led, model_led = load_led()


def legalbert_extractive_summary(text, top_ratio=0.2):
    """Select the sentences of *text* closest to the document's mean embedding.

    Args:
        text: Text to summarize.
        top_ratio: Fraction of sentences to keep; at least three are kept.

    Returns:
        *text* unchanged when it has no more sentences than would be kept;
        otherwise the selected sentences, in their original order, joined
        with single spaces.
    """
    sentence_list = sent_tokenize(text)
    keep_count = max(3, int(len(sentence_list) * top_ratio))

    # Nothing to drop: the whole text already fits in the budget.
    if keep_count >= len(sentence_list):
        return text

    # Score every sentence by cosine similarity to the mean sentence embedding.
    embeddings = legalbert_model.encode(sentence_list, convert_to_tensor=True)
    centroid = torch.mean(embeddings, dim=0)
    similarity = util.pytorch_cos_sim(centroid, embeddings)[0]
    best = torch.topk(similarity, k=keep_count)

    # Sort the winning indices so the summary follows document order.
    kept_indices = sorted(best.indices.tolist())
    return " ".join(sentence_list[idx] for idx in kept_indices)



    # Add LED Abstractive Summarization


def led_abstractive_summary(text, max_length=512, min_length=100, max_input_length=4096):
    """Generate an abstractive summary of *text* with the LED model.

    Args:
        text: Text to summarize.
        max_length: Maximum length passed to ``generate``.
        min_length: Minimum length passed to ``generate``.
        max_input_length: Inputs longer than this many tokens are truncated.

    Returns:
        The decoded summary with special tokens stripped, or ``""`` when
        *text* is empty or whitespace-only (nothing to summarize).
    """
    if not text or not text.strip():
        return ""

    # No padding="max_length": padding every input out to the full cap makes
    # the encoder process thousands of pad tokens even for a few sentences.
    # LED pads internally to a multiple of its attention window as needed.
    inputs = tokenizer_led(
        text, return_tensors="pt",
        truncation=True, max_length=max_input_length
    )
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1  # Global attention on first token

    outputs = model_led.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        global_attention_mask=global_attention_mask,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4
    )
    return tokenizer_led.decode(outputs[0], skip_special_tokens=True)


def hybrid_summary_by_section(text, top_ratio=0.8):
    """Build a per-section extractive + abstractive summary of *text*.

    The text is cleaned with clean_text, split into sections with
    section_by_zero_shot, and every non-empty section is summarized twice:
    extractively (legalbert_extractive_summary) and then abstractively
    (led_abstractive_summary applied to the extractive result).

    Args:
        text: Raw document text.
        top_ratio: Fraction of each section's sentences kept by the
            extractive step.

    Returns:
        Markdown with one ``### 📘 <name> Section:`` block per non-empty
        section, blocks separated by blank lines; ``""`` if every section
        is empty.
    """
    cleaned_text = clean_text(text)
    sections = section_by_zero_shot(cleaned_text)  # Facts, Arguments, Judgment, Other

    summary_parts = []
    for name, content in sections.items():
        if not content.strip():
            continue

        # Extractive pass; honours the caller's top_ratio instead of a
        # hard-coded value.
        extractive = legalbert_extractive_summary(content, top_ratio)

        # Abstractive pass over the already-condensed text.
        abstractive = led_abstractive_summary(extractive)

        hybrid = f"📌 **Extractive Summary:**\n{extractive}\n\n🔍 **Abstractive Summary:**\n{abstractive}"
        # NOTE: clean_text collapses the newlines inside `hybrid`, so both
        # sub-summaries end up on a single line under the section heading.
        summary_parts.append(f"### 📘 {name} Section:\n{clean_text(hybrid)}")

    return "\n\n".join(summary_parts)


#######################################################################################################################


# STREAMLIT APP INTERFACE CODE

# Initialize or load chat history: the shelve-backed history is read only
# when the session has no "messages" entry yet.
if "messages" not in st.session_state:
    st.session_state.messages = load_chat_history()

# Initialize last_uploaded if not set
# NOTE(review): the upload handler further down reads and writes
# "last_uploaded_hash", not "last_uploaded"; within this file this key is only
# ever assigned None. Confirm whether the two names were meant to be the same.
if "last_uploaded" not in st.session_state:
    st.session_state.last_uploaded = None

# Sidebar with a button to delete chat history
with st.sidebar:
    st.subheader("⚙️ Options")
    if st.button("Delete Chat History"):
        # Clear the in-session list and overwrite the persisted copy.
        st.session_state.messages = []
        st.session_state.last_uploaded = None
        save_chat_history([])

# Display chat messages with a typing effect
def display_with_typing_effect(text, speed=0.005):
    """Render *text* as markdown one character at a time.

    A single placeholder is re-rendered with a progressively longer prefix of
    *text*, sleeping *speed* seconds after each update.

    Returns:
        The text that ended up displayed (``""`` for empty input).
    """
    slot = st.empty()
    shown = ""
    for end in range(1, len(text) + 1):
        shown = text[:end]
        slot.markdown(shown)
        time.sleep(speed)
    return shown

# Show existing chat messages, choosing the avatar from the stored role
# (anything other than "user" gets the bot avatar).
for message in st.session_state.messages:
    avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
    with st.chat_message(message["role"], avatar=avatar):
        st.markdown(message["content"])


# Standard chat input field; `prompt` is falsy unless the user submitted text
# on this run (see the `if prompt:` handler below).
prompt = st.chat_input("Type a message...")

# # Place file uploader AFTER the chat input to keep layout consistent
# uploaded_file = st.file_uploader("📎 Upload a file (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"])

# Place uploader before the chat so it's always visible.
# `uploaded_file` and `reprocess_btn` drive the upload handler below.
with st.container():
    st.subheader("📎 Upload a Legal Document")
    uploaded_file = st.file_uploader("Upload a file (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"])
    reprocess_btn = st.button("🔄 Reprocess Last Uploaded File")


# Hashing logic
def get_file_hash(file):
    """Return the MD5 hex digest of the full contents of *file*.

    The stream is rewound before reading and again afterwards, so the caller
    gets it back positioned at the start regardless of where it was.
    """
    file.seek(0)
    payload = file.read()
    file.seek(0)

    hasher = hashlib.md5()
    hasher.update(payload)
    return hasher.hexdigest()

# # Handle file upload and generate hybrid summary
# if uploaded_file:
#     file_hash = get_file_hash(uploaded_file)

#     # Check if this file is already uploaded
#     if file_hash != st.session_state.get("last_uploaded_hash"):
#         raw_text = extract_text(uploaded_file)
#         summary_text = hybrid_summary_by_section(raw_text)

#         st.session_state.messages.append({
#             "role": "user",
#             "content": f"📤 Uploaded **{uploaded_file.name}**"
#         })

#         with st.chat_message("assistant", avatar=BOT_AVATAR):
#             preview_text = f"🧾 **Hybrid Summary of {uploaded_file.name}:**\n\n{summary_text}"
#             display_with_typing_effect(clean_text(preview_text), speed=0.000001)

#         st.session_state.messages.append({
#             "role": "assistant",
#             "content": preview_text
#         })

#         st.session_state.last_uploaded_hash = file_hash
#         save_chat_history(st.session_state.messages)

#         # Force rerun to reset uploader state & redraw layout properly
#         st.rerun()



# Handle an uploaded file: summarize it when its content hash differs from the
# last processed one, or when the reprocess button was pressed on this run.
if uploaded_file:
    file_hash = get_file_hash(uploaded_file)

    # Check if file is new OR reprocess is triggered
    if file_hash != st.session_state.get("last_uploaded_hash") or reprocess_btn:
        raw_text = extract_text(uploaded_file)
        summary_text = hybrid_summary_by_section(raw_text)

        # Record the upload as a user turn in the chat history.
        st.session_state.messages.append({
            "role": "user",
            "content": f"📤 Uploaded **{uploaded_file.name}**"
        })

        with st.chat_message("assistant", avatar=BOT_AVATAR):
            preview_text = f"🧾 **Hybrid Summary of {uploaded_file.name}:**\n\n{summary_text}"
            # The animated text is passed through clean_text (newlines
            # collapsed); the history entry below keeps the uncleaned markdown.
            display_with_typing_effect(clean_text(preview_text), speed=0.000001)

        st.session_state.messages.append({
            "role": "assistant",
            "content": preview_text
        })

        # Save this file hash only if it's a new upload (avoid overwriting during reprocess)
        # NOTE(review): if a file whose hash is not yet stored is processed on a
        # run where reprocess_btn is True, the hash is not saved and the rerun
        # below would process the same file again. Confirm this is intended.
        if not reprocess_btn:
            st.session_state.last_uploaded_hash = file_hash

        # Persist the updated history, then rerun so the messages are redrawn
        # by the history loop above.
        save_chat_history(st.session_state.messages)
        st.rerun()


# Handle chat input and return hybrid summary: the typed message itself is
# treated as the document to summarize.
if prompt:
    raw_text = prompt
    summary_text = hybrid_summary_by_section(raw_text)

    # Record the user's text as a chat turn.
    st.session_state.messages.append({
        "role": "user",
        "content": prompt
    })

    with st.chat_message("assistant", avatar=BOT_AVATAR):
        bot_response = f"📝 **Hybrid Summary of your text:**\n\n{summary_text}"
        # The animated text is passed through clean_text (newlines collapsed);
        # the history entry below keeps the uncleaned markdown.
        display_with_typing_effect(clean_text(bot_response), speed=0.000005)

    st.session_state.messages.append({
        "role": "assistant",
        "content": bot_response
    })

    # Persist the updated history, then rerun so the messages are redrawn by
    # the history loop above.
    save_chat_history(st.session_state.messages)
    st.rerun()