# app.py — robust version (works with quotes_1000.csv, quotes.csv, or a public fallback)
import os

import gradio as gr
import pandas as pd
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util


# ---- 1) Load quotes from available source ----
def load_quotes():
    """Return the list of unique, whitespace-trimmed quotes to search over.

    Sources are tried in order: ``quotes_1000.csv``, then ``quotes.csv``
    (both relative to the current working directory), then up to 1000 rows
    of the public ``Abirate/english_quotes`` dataset.

    The quote text is taken from a ``quote`` column, else a ``text`` column
    (names compared case-insensitively), else the only column when the table
    has exactly one.

    Returns:
        Quotes in first-seen order, stripped of surrounding whitespace, with
        blanks and duplicates removed.

    Raises:
        ValueError: If no usable column is found, or fewer than 10 unique
            quotes remain after cleaning.
    """
    # Priority 1: use your new 1000 quotes
    if os.path.exists("quotes_1000.csv"):
        df = pd.read_csv("quotes_1000.csv")
    # Priority 2: use your older file if present
    elif os.path.exists("quotes.csv"):
        df = pd.read_csv("quotes.csv")
    else:
        # Priority 3 (fallback): load 1000 from public dataset
        ds = load_dataset("Abirate/english_quotes", split="train")
        # Bound the selection so a shorter split does not raise an index error.
        sample = ds.select(range(min(1000, len(ds))))
        df = pd.DataFrame({"quote": [row["quote"] for row in sample]})

    # Normalize column names (accept 'quote' or 'text' or single unnamed column).
    # str() guards against non-string labels; strip() tolerates padded headers.
    df.columns = [str(c).strip().lower() for c in df.columns]

    if "quote" in df.columns:
        series = df["quote"]
    elif "text" in df.columns:
        series = df["text"]
    elif len(df.columns) == 1:
        # if there is only one column, use it
        series = df.iloc[:, 0]
    else:
        raise ValueError("CSV must have a 'quote' or 'text' column, or a single column.")

    # Deduplicate while preserving order. The membership test must use the
    # *stripped* text (the same form that is stored in `seen`); otherwise
    # "abc " and "abc" would both be kept as "abc".
    seen = set()
    unique_quotes = []
    for raw in series.dropna().astype(str):
        cleaned = raw.strip()
        if cleaned and cleaned not in seen:
            seen.add(cleaned)
            unique_quotes.append(cleaned)

    if len(unique_quotes) < 10:
        raise ValueError("Not enough quotes found. Please check your CSV.")

    return unique_quotes


quotes = load_quotes()

# ---- 2) Load embedding model and build embeddings ----
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# tensors for cosine sim
corpus_emb = model.encode(quotes, convert_to_tensor=True, show_progress_bar=True)


# ---- 3) Search function (top-3 similar) ----
def get_top3(user_input: str) -> str:
    """Return the three quotes most similar to ``user_input`` as display text.

    Args:
        user_input: Free-form theme or sentence typed by the user.

    Returns:
        A numbered list ("1. ...", "2. ...", "3. ...") separated by blank
        lines, ordered from most to least similar by cosine similarity; or a
        prompt message when the input is empty or whitespace-only.
    """
    if not user_input or not user_input.strip():
        return "Please type something (e.g., 'happiness', 'overcoming failure', 'creativity')."

    q_emb = model.encode(user_input, convert_to_tensor=True)
    sims = util.cos_sim(q_emb, corpus_emb)[0]  # tensor of similarities

    # torch.topk raises if k exceeds the number of candidates, so cap it.
    topk = torch.topk(sims, k=min(3, sims.shape[0]))
    idxs = topk.indices.tolist()

    results = [f"{i+1}. {quotes[idx]}" for i, idx in enumerate(idxs)]
    return "\n\n".join(results)


# ---- 4) Gradio UI ----
demo = gr.Interface(
    fn=get_top3,
    inputs=gr.Textbox(lines=2, placeholder="Type a theme or sentence..."),
    outputs=gr.Textbox(label="Top 3 similar quotes"),
    title="Quote Finder (Semantic Search)",
    description="Enter any phrase to get 3 semantically similar quotes. Works with your uploaded CSV or a public fallback.",
    examples=[["happiness"], ["overcoming failure"], ["friendship"]],
)

if __name__ == "__main__":
    demo.launch()