Spaces:
Sleeping
Sleeping
# app.py — robust version (works with quotes_1000.csv, quotes.csv, or a public fallback)
import os | |
import pandas as pd | |
import gradio as gr | |
from sentence_transformers import SentenceTransformer, util | |
from datasets import load_dataset | |
import torch | |
# ---- 1) Load quotes from available source ----
def load_quotes() -> list[str]:
    """Load a de-duplicated list of quote strings from the best available source.

    Sources are tried in priority order:

    1. ``quotes_1000.csv`` in the current working directory.
    2. ``quotes.csv`` in the current working directory.
    3. Up to the first 1000 rows of the public ``Abirate/english_quotes``
       dataset.

    The quote column is looked up case-insensitively as ``quote``, then
    ``text``; if neither exists and the table has exactly one column, that
    column is used.

    Returns:
        Unique, whitespace-stripped, non-empty quotes in first-seen order.

    Raises:
        ValueError: If no usable column can be identified, or if fewer than
            10 unique quotes remain after cleaning.
    """
    for csv_name in ("quotes_1000.csv", "quotes.csv"):
        if os.path.exists(csv_name):
            df = pd.read_csv(csv_name)
            break
    else:
        # No local CSV found: fall back to the public dataset. Clamp the
        # selection so a dataset with fewer than 1000 rows does not raise.
        ds = load_dataset("Abirate/english_quotes", split="train")
        limit = min(1000, len(ds))
        df = pd.DataFrame({"quote": [row["quote"] for row in ds.select(range(limit))]})

    # Normalize column names; str() guards against non-string labels.
    df.columns = [str(c).strip().lower() for c in df.columns]
    if "quote" in df.columns:
        series = df["quote"]
    elif "text" in df.columns:
        series = df["text"]
    elif len(df.columns) == 1:
        series = df.iloc[:, 0]
    else:
        raise ValueError("CSV must have a 'quote' or 'text' column, or a single column.")

    # Strip BEFORE de-duplicating so " foo" and "foo" count as the same quote.
    # dict.fromkeys keeps the first occurrence and preserves order.
    stripped = (q.strip() for q in series.dropna().astype(str))
    unique_quotes = list(dict.fromkeys(q for q in stripped if q))

    if len(unique_quotes) < 10:
        raise ValueError("Not enough quotes found. Please check your CSV.")
    return unique_quotes
# Corpus of quote strings searched by get_top3.
quotes = load_quotes()

# ---- 2) Load embedding model and build embeddings ----
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Embed every quote once at import time (as a tensor, for cosine similarity)
# so that each query only has to encode the user's text.
corpus_emb = model.encode(
    quotes,
    show_progress_bar=True,
    convert_to_tensor=True,
)
# ---- 3) Search function (top-3 similar) ----
def get_top3(user_input: str) -> str:
    """Return the quotes most similar to ``user_input`` as a numbered list.

    Args:
        user_input: Free-text theme or sentence to search for.

    Returns:
        Up to three quotes, numbered from 1 and separated by blank lines,
        ordered from most to least similar. If ``user_input`` is empty or
        only whitespace, a prompt asking the user to type something is
        returned instead.
    """
    if not user_input or not user_input.strip():
        return "Please type something (e.g., 'happiness', 'overcoming failure', 'creativity')."

    q_emb = model.encode(user_input, convert_to_tensor=True)
    # Row 0 holds the similarity of the single query to each corpus entry.
    sims = util.cos_sim(q_emb, corpus_emb)[0]

    # torch.topk raises if k exceeds the number of elements, so clamp it
    # to the corpus size instead of assuming at least three quotes exist.
    k = min(3, sims.numel())
    best = torch.topk(sims, k=k).indices.tolist()

    results = [f"{rank}. {quotes[idx]}" for rank, idx in enumerate(best, start=1)]
    return "\n\n".join(results)
# ---- 4) Gradio UI ----
# One text box in, one text box out; each submission is passed to get_top3.
demo = gr.Interface(
    fn=get_top3,
    title="Quote Finder (Semantic Search)",
    description="Enter any phrase to get 3 semantically similar quotes. Works with your uploaded CSV or a public fallback.",
    inputs=gr.Textbox(lines=2, placeholder="Type a theme or sentence..."),
    outputs=gr.Textbox(label="Top 3 similar quotes"),
    examples=[["happiness"], ["overcoming failure"], ["friendship"]],
)

# Only start the web server when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()