import gradio as gr
import re
import fitz # PyMuPDF for PDF extraction
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
# ---------- Setup ----------
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
# ---------- Helper: extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text
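# A minimal usage sketch for the helper above ("sample.pdf" is a hypothetical
# local path, not shipped with this Space); kept commented out so importing
# this module stays side-effect free:
#
#   text = extract_text_from_pdf("sample.pdf")
#   print(f"Extracted {len(text)} characters")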
# ---------- Helper: Transformer Topic Modeling ----------
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    """Cluster sentence embeddings with KMeans and label each cluster by its top keywords."""
    print("🔹 Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")
    embeddings = model.encode(sentences)
    # Auto-detect the number of topics with a simple elbow heuristic:
    # fit KMeans for each candidate k and pick the k just before the
    # largest drop in inertia (the most negative first difference).
    if auto_topics:
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        # Cap the fallback by the sample size so KMeans never receives
        # more clusters than there are sentences.
        num_topics = K[np.argmin(diffs)] if len(diffs) > 0 else min(3, len(sentences))
    kmeans = KMeans(n_clusters=num_topics, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})
    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)
        # --- Extract keywords (3+ letter words, stopwords excluded) ---
        words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
        filtered = [w for w in words if w not in stop_words]
        if filtered:
            top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
        else:
            top_words = ["General"]
        title = " & ".join(top_words).title()
        topics.append((title, " ".join(topic_sentences[:3])))
    return topics, num_topics
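# A minimal standalone sketch of the clustering step (the sentences are
# made-up examples); left commented out because running it downloads the
# SentenceTransformer model:
#
#   demo_sentences = [
#       "Solar panels convert sunlight into electricity",
#       "Wind turbines generate power from moving air",
#       "The novel explores themes of memory and loss",
#       "Its narrator revisits her childhood home",
#   ]
#   topics, k = transformer_topic_modeling(demo_sentences, auto_topics=False, num_topics=2)
#   for title, examples in topics:
#       print(title, "->", examples)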
# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    try:
        pdf_text = ""
        if pdf_file:
            # gr.File may hand back a tempfile-like object (with .name) or a
            # plain path string, depending on the Gradio version; accept both.
            pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
            pdf_text = extract_text_from_pdf(pdf_path)
            print("✅ PDF extracted successfully, length:", len(pdf_text))
        full_text = (pdf_text + "\n" + (essay_text or "")).strip()
        if not full_text:
            return "❌ Please upload a PDF or write an essay."
        # Naive sentence split on ., !, ?; keep only sentences longer than 20 characters.
        sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
        print("🧾 Sentence count:", len(sentences))
        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."
        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("✅ Topics discovered:", num_topics)
        # Build Markdown output for Gradio
        output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
        for i, (title, examples) in enumerate(topic_data, 1):
            output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")
        return "\n\n".join(output_lines)  # ✅ Return string only
    except Exception as e:
        import traceback
        print(traceback.format_exc())  # full log in the Hugging Face console
        return f"⚠️ Error: {str(e)}"
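# For quick local testing without the UI, the handler can be called directly
# (the essay text below is an invented example); commented out so the Space
# only runs the Gradio app:
#
#   sample = ("Renewable energy adoption is accelerating across many countries. "
#             "Meanwhile, battery storage costs continue to fall year after year.")
#   print(analyze_input(None, sample))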
# ---------- Gradio UI ----------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📂 Upload a PDF (optional)"),
        gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here..."),
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App (PDF + Essay)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings.",
)
if __name__ == "__main__":
    demo.launch()