File size: 3,953 Bytes
6d5ccf0
 
 
 
 
 
 
bd7a3b4
 
6d5ccf0
bd7a3b4
 
 
6d5ccf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd7a3b4
 
 
 
 
 
 
 
 
6d5ccf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd7a3b4
6d5ccf0
 
 
 
 
bd7a3b4
6d5ccf0
 
 
bd7a3b4
6d5ccf0
 
 
 
 
 
 
 
 
 
 
bd7a3b4
6d5ccf0
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
import re
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords

# ---------- Setup ----------
# Download the NLTK stopword corpus once (no-op if already cached locally).
nltk.download('stopwords', quiet=True)
# English stopwords used to filter keywords when titling topics.
stop_words = set(stopwords.words('english'))

# ---------- Helper: extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in the PDF at *pdf_path*."""
    pages = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pages.append(page.get_text())
    return "".join(pages)


# ---------- Helper: Transformer Topic Modeling ----------
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    """Cluster sentences into topics via transformer embeddings + KMeans.

    Args:
        sentences: list of sentence strings to cluster (at least 1).
        auto_topics: when True, choose the cluster count with an elbow
            heuristic over KMeans inertia; `num_topics` is then ignored.
        num_topics: explicit cluster count used when auto_topics is False.

    Returns:
        (topics, num_topics): `topics` is a list of (title, example_text)
        tuples — title is the top-3 non-stopword keywords, example_text the
        first 3 sentences of the cluster; `num_topics` is the count used.
    """
    print("πŸ”Ή Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")

    embeddings = model.encode(sentences)

    # Auto-detect number of topics: fit KMeans over a small k range and pick
    # the k with the steepest inertia drop (simple elbow heuristic).
    if auto_topics:
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        num_topics = K[np.argmin(diffs)] if len(diffs) > 0 else 3

    # BUG FIX: KMeans raises ValueError when n_clusters > n_samples — e.g.
    # only 2 sentences make diffs empty and the fallback of 3 exceeds the
    # sample count (a caller-supplied num_topics can overshoot too).
    # Clamp to the sample count and keep at least one cluster.
    num_topics = max(1, min(int(num_topics), len(sentences)))

    kmeans = KMeans(n_clusters=num_topics, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})

    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)

        # Title the topic with its 3 most frequent non-stopword words
        # (lowercase, >= 3 letters); fall back to "General" if none remain.
        words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
        filtered = [w for w in words if w not in stop_words]
        if filtered:
            top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
        else:
            top_words = ["General"]

        title = " & ".join(top_words).title()
        topics.append((title, " ".join(topic_sentences[:3])))

    return topics, num_topics


# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    """Gradio handler: extract text from the inputs, run topic modeling,
    and return a Markdown report (always a plain string)."""
    try:
        extracted = ""
        if pdf_file:
            extracted = extract_text_from_pdf(pdf_file.name)
            print("βœ… PDF extracted successfully, length:", len(extracted))

        combined = (extracted + "\n" + (essay_text or "")).strip()
        if not combined:
            return "❌ Please upload a PDF or write an essay."

        # Split on sentence-ending punctuation; keep only substantial fragments.
        sentences = []
        for fragment in re.split(r'[.!?]', combined):
            fragment = fragment.strip()
            if len(fragment) > 20:
                sentences.append(fragment)
        print("🧾 Sentence count:", len(sentences))

        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."

        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("βœ… Topics discovered:", num_topics)

        # Assemble the Markdown report for the Gradio output pane.
        report = [f"βœ… **Detected {num_topics} Topics:**\n"]
        report.extend(
            f"**Topic {idx}: {title}**\n{examples}\n"
            for idx, (title, examples) in enumerate(topic_data, 1)
        )
        return "\n\n".join(report)

    except Exception as e:
        import traceback
        print(traceback.format_exc())  # full log in Hugging Face console
        return f"⚠️ Error: {str(e)}"


# ---------- Gradio UI ----------
# Single-function interface: an optional PDF upload plus a free-text essay
# box; analyze_input returns a Markdown string rendered in the output pane.
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="πŸ“‚ Upload a PDF (optional)"),
        gr.Textbox(label="πŸ“ Essay Text", lines=7, placeholder="Write or paste your essay here...")
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App (PDF + Essay)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

# Launch the local Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()