File size: 3,953 Bytes
6d5ccf0
 
 
 
 
 
 
bd7a3b4
 
6d5ccf0
bd7a3b4
 
 
6d5ccf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd7a3b4
 
 
 
 
 
 
 
 
6d5ccf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd7a3b4
6d5ccf0
 
 
 
 
bd7a3b4
6d5ccf0
 
 
bd7a3b4
6d5ccf0
 
 
 
 
 
 
 
 
 
 
bd7a3b4
6d5ccf0
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
import re
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords

# ---------- Setup ----------
# Download the NLTK stopword corpus once (no-op if already cached locally).
nltk.download('stopwords', quiet=True)
# English stopwords used to filter keywords when titling topics.
stop_words = set(stopwords.words('english'))

# ---------- Helper: extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in the PDF at *pdf_path*."""
    pages = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pages.append(page.get_text())
    return "".join(pages)


# ---------- Helper: Transformer Topic Modeling ----------
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    """Cluster sentences into topics via transformer embeddings + KMeans.

    Args:
        sentences: list of sentence strings to cluster (at least 1).
        auto_topics: when True, choose the cluster count with an elbow
            heuristic over KMeans inertia; `num_topics` is then ignored.
        num_topics: explicit cluster count used when auto_topics is False.

    Returns:
        (topics, num_topics): `topics` is a list of (title, example_text)
        tuples — title is the top-3 non-stopword keywords, example_text the
        first 3 sentences of the cluster; `num_topics` is the count used.
    """
    print("πŸ”Ή Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")

    embeddings = model.encode(sentences)

    # Auto-detect number of topics: fit KMeans over a small k range and pick
    # the k with the steepest inertia drop (simple elbow heuristic).
    if auto_topics:
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        num_topics = K[np.argmin(diffs)] if len(diffs) > 0 else 3

    # BUG FIX: KMeans raises ValueError when n_clusters > n_samples — e.g.
    # only 2 sentences make diffs empty and the fallback of 3 exceeds the
    # sample count (a caller-supplied num_topics can overshoot too).
    # Clamp to the sample count and keep at least one cluster.
    num_topics = max(1, min(int(num_topics), len(sentences)))

    kmeans = KMeans(n_clusters=num_topics, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})

    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)

        # Title the topic with its 3 most frequent non-stopword words
        # (lowercase, >= 3 letters); fall back to "General" if none remain.
        words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
        filtered = [w for w in words if w not in stop_words]
        if filtered:
            top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
        else:
            top_words = ["General"]

        title = " & ".join(top_words).title()
        topics.append((title, " ".join(topic_sentences[:3])))

    return topics, num_topics


# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    """Gradio handler: extract text from the inputs, run topic modeling,
    and return a Markdown report (always a plain string)."""
    try:
        extracted = ""
        if pdf_file:
            extracted = extract_text_from_pdf(pdf_file.name)
            print("βœ… PDF extracted successfully, length:", len(extracted))

        combined = (extracted + "\n" + (essay_text or "")).strip()
        if not combined:
            return "❌ Please upload a PDF or write an essay."

        # Split on sentence-ending punctuation; keep only substantial fragments.
        sentences = []
        for fragment in re.split(r'[.!?]', combined):
            fragment = fragment.strip()
            if len(fragment) > 20:
                sentences.append(fragment)
        print("🧾 Sentence count:", len(sentences))

        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."

        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("βœ… Topics discovered:", num_topics)

        # Assemble the Markdown report for the Gradio output pane.
        report = [f"βœ… **Detected {num_topics} Topics:**\n"]
        report.extend(
            f"**Topic {idx}: {title}**\n{examples}\n"
            for idx, (title, examples) in enumerate(topic_data, 1)
        )
        return "\n\n".join(report)

    except Exception as e:
        import traceback
        print(traceback.format_exc())  # full log in Hugging Face console
        return f"⚠️ Error: {str(e)}"


# ---------- Gradio UI ----------
# Single-function interface: an optional PDF upload plus a free-text essay
# box; analyze_input returns a Markdown string rendered in the output pane.
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="πŸ“‚ Upload a PDF (optional)"),
        gr.Textbox(label="πŸ“ Essay Text", lines=7, placeholder="Write or paste your essay here...")
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App (PDF + Essay)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

# Launch the local Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()