import gradio as gr
import re
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords

# ---------- Setup ----------
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# ---------- Helper: extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text
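
# Usage sketch (hypothetical file name):
#   extract_text_from_pdf("essay.pdf")  # -> concatenated text of every page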

# ---------- Helper: Transformer Topic Modeling ----------
def transformer_topic_modeling(sentences, auto_topics=True, num_topics=5):
    print("🔹 Using Transformer-based Embeddings...")
    model = SentenceTransformer("flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot")
    embeddings = model.encode(sentences)
    # Auto-detect the number of topics with a simple elbow heuristic:
    # fit KMeans for each candidate k, record the inertia, and pick the k
    # just before the largest drop.
    if auto_topics:
        distortions = []
        K = range(2, min(10, len(sentences) // 2 + 2))
        for k in K:
            km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
            distortions.append(km.inertia_)
        diffs = np.diff(distortions)
        if len(diffs) > 0:
            num_topics = K[int(np.argmin(diffs))]
        else:
            # Too few candidates to compare; fall back to a small count,
            # capped so KMeans never gets more clusters than samples.
            num_topics = min(3, len(sentences))
    kmeans = KMeans(n_clusters=num_topics, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": labels})
    topics = []
    for i in range(num_topics):
        topic_sentences = df[df["Topic"] == i]["Sentence"].tolist()
        joined_text = " ".join(topic_sentences)
        # --- Extract keywords excluding stopwords ---
        words = re.findall(r"\b[a-z]{3,}\b", joined_text.lower())
        filtered = [w for w in words if w not in stop_words]
        if filtered:
            top_words = pd.Series(filtered).value_counts().head(3).index.tolist()
        else:
            top_words = ["General"]
        title = " & ".join(top_words).title()
        topics.append((title, " ".join(topic_sentences[:3])))
    return topics, num_topics
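
# Standalone usage sketch (hypothetical sentences; the embedding model is
# downloaded on first use):
#   sents = ["Solar panels cut household electricity bills.",
#            "Wind turbines generate clean power offshore.",
#            "The novel follows two estranged brothers."]
#   topics, k = transformer_topic_modeling(sents, auto_topics=False, num_topics=2)
#   # -> list of (title, example sentences) pairs, plus the topic count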

# ---------- Main Function ----------
def analyze_input(pdf_file, essay_text):
    try:
        pdf_text = ""
        if pdf_file:
            # gr.File may pass a tempfile-like object (with .name) or, in
            # newer Gradio versions, a plain filepath string; handle both.
            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
            pdf_text = extract_text_from_pdf(pdf_path)
            print("✅ PDF extracted successfully, length:", len(pdf_text))
        full_text = (pdf_text + "\n" + (essay_text or "")).strip()
        if not full_text:
            return "❌ Please upload a PDF or write an essay."
        # Split on sentence-ending punctuation; fragments of 20 characters
        # or fewer (headings, page numbers) are dropped.
        sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
        print("🧾 Sentence count:", len(sentences))
        if len(sentences) < 2:
            return "⚠️ Not enough text for topic modeling."
        topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)
        print("✅ Topics discovered:", num_topics)
        # Build Markdown output for Gradio
        output_lines = [f"✅ **Detected {num_topics} Topics:**\n"]
        for i, (title, examples) in enumerate(topic_data, 1):
            output_lines.append(f"**Topic {i}: {title}**\n{examples}\n")
        result = "\n\n".join(output_lines)
        return result  # ✅ Return string only
    except Exception as e:
        import traceback
        print(traceback.format_exc())  # full log in Hugging Face console
        return f"⚠️ Error: {str(e)}"
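
# Quick local check without launching the UI (hypothetical essay text):
#   print(analyze_input(None, "Solar power is growing fast. Wind farms are expanding."))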

# ---------- Gradio UI ----------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📄 Upload a PDF (optional)"),
        gr.Textbox(label="📝 Essay Text", lines=7, placeholder="Write or paste your essay here...")
    ],
    outputs=gr.Markdown(label="🧠 Topic Analysis Result"),
    title="Topic Modeling App (PDF + Essay)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

if __name__ == "__main__":
    demo.launch()
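
# A minimal requirements.txt sketch for this Space, inferred from the imports
# above (pin versions as needed):
#   gradio
#   PyMuPDF
#   pandas
#   numpy
#   scikit-learn
#   sentence-transformers
#   nltk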