import gradio as gr
import re
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords

# ---------- Setup ----------
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# ---------- Helper: extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text
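
# Usage sketch (hypothetical file name):
#   extract_text_from_pdf("essay.pdf")  # -> concatenated text of every page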

# ---------- Helper: Transformer Topic Modeling ----------
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    print("🔹 Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")
    embeddings = model.encode(sentences)
    # Auto-detect the number of topics with a simple elbow heuristic:
    # fit KMeans for each candidate k, record the inertia, and pick the k
    # just before the largest drop.
    if auto_topics:
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        if len(diffs) > 0:
            num_topics = K[int(np.argmin(diffs))]
        else:
            # Too few candidates to compare; fall back to a small count,
            # capped so KMeans never gets more clusters than samples.
            num_topics = min(3, len(sentences))
    kmeans = KMeans(n_clusters=num_topics, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})
    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)
        # --- Extract keywords excluding stopwords ---
        words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
        filtered = [w for w in words if w not in stop_words]
        if filtered:
            top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
        else:
            top_words = ["General"]
        title = " & ".join(top_words).title()
        topics.append((title, " ".join(topic_sentences[:3])))
    return topics, num_topics
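
# Standalone usage sketch (hypothetical sentences; the embedding model is
# downloaded on first use):
#   sents = ["Solar panels cut household electricity bills.",
#            "Wind turbines generate clean power offshore.",
#            "The novel follows two estranged brothers."]
#   topics, k = transformer_topic_modeling(sents, auto_topics=False, num_topics=2)
#   # -> list of (title, example sentences) pairs, plus the topic count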

# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    try:
        pdf_text = ""
        if pdf_file:
            # gr.File may pass a tempfile-like object (with .name) or, in
            # newer Gradio versions, a plain filepath string; handle both.
            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
            pdf_text = extract_text_from_pdf(pdf_path)
            print("✅ PDF extracted successfully, length:", len(pdf_text))
        full_text = (pdf_text + "\n" + (essay_text or "")).strip()
        if not full_text:
            return "❌ Please upload a PDF or write an essay."
        # Split on sentence-ending punctuation; fragments of 20 characters
        # or fewer (headings, page numbers) are dropped.
        sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
        print("🧾 Sentence count:", len(sentences))
        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."
        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("✅ Topics discovered:", num_topics)
        # Build Markdown output for Gradio
        output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
        for i, (title, examples) in enumerate(topic_data, 1):
            output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")
        result = "\n\n".join(output_lines)
        return result  # ✅ Return string only
    except Exception as e:
        import traceback
        print(traceback.format_exc())  # full log in Hugging Face console
        return f"⚠️ Error: {str(e)}"
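
# Quick local check without launching the UI (hypothetical essay text):
#   print(analyze_input(None, "Solar power is growing fast. Wind farms are expanding."))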

# ---------- Gradio UI ----------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📄 Upload a PDF (optional)"),
        gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App (PDF + Essay)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

if __name__ == "__main__":
    demo.launch()
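
# A minimal requirements.txt sketch for this Space, inferred from the imports
# above (pin versions as needed):
#   gradio
#   PyMuPDF
#   pandas
#   numpy
#   scikit-learn
#   sentence-transformers
#   nltk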