# Finalproject / app.py
# nikolltt's picture
# Update app.py
# 405c4e2 verified
# app.py — robust version (works with quotes_1000.csv, quotes.csv, or a public fallback)
import os
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
import torch
# ---- 1) Load quotes from available source ----
def load_quotes() -> list:
    """Load the quote corpus from the first available source.

    Sources are tried in this order:

    1. ``quotes_1000.csv`` in the current working directory.
    2. ``quotes.csv`` in the current working directory.
    3. Up to the first 1000 rows of the public ``Abirate/english_quotes``
       dataset (fallback).

    Column names are normalized (stringified, stripped, lower-cased). The
    quote text is read from the ``quote`` column, else the ``text`` column,
    else the only column when the table has exactly one.

    Returns:
        A list of whitespace-stripped, non-empty quote strings with
        duplicates removed, in first-seen order.

    Raises:
        ValueError: If no usable column can be identified, or if fewer than
            10 unique quotes remain after cleaning.
    """
    if os.path.exists("quotes_1000.csv"):
        # Priority 1: the 1000-quote file.
        df = pd.read_csv("quotes_1000.csv")
    elif os.path.exists("quotes.csv"):
        # Priority 2: the older file.
        df = pd.read_csv("quotes.csv")
    else:
        # Priority 3 (fallback): public dataset, capped at 1000 rows. The cap
        # is clamped to the dataset length so a shorter split cannot cause an
        # out-of-range selection.
        ds = load_dataset("Abirate/english_quotes", split="train")
        limit = min(1000, len(ds))
        df = pd.DataFrame({"quote": [row["quote"] for row in ds.select(range(limit))]})

    # Normalize headers; str() guards against non-string column labels.
    df.columns = [str(c).strip().lower() for c in df.columns]
    if "quote" in df.columns:
        series = df["quote"]
    elif "text" in df.columns:
        series = df["text"]
    elif len(df.columns) == 1:
        series = df.iloc[:, 0]
    else:
        raise ValueError("CSV must have a 'quote' or 'text' column, or a single column.")

    # Deduplicate while preserving order. Strip BEFORE the membership test so
    # that values differing only in surrounding whitespace count as the same
    # quote (the stored and compared forms must match).
    seen = set()
    unique_quotes = []
    for raw in series.dropna().astype(str):
        quote = raw.strip()
        if quote and quote not in seen:
            seen.add(quote)
            unique_quotes.append(quote)

    if len(unique_quotes) < 10:
        raise ValueError("Not enough quotes found. Please check your CSV.")
    return unique_quotes
# Corpus of quote strings; get_top3 indexes into this list.
quotes = load_quotes()

# ---- 2) Embedding model + precomputed corpus embeddings ----
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Encode every quote once at import time; the result is kept as a tensor
# so it can be passed straight to the cosine-similarity call at query time.
corpus_emb = model.encode(
    quotes,
    convert_to_tensor=True,
    show_progress_bar=True,
)
# ---- 3) Search function (top-3 similar) ----
def get_top3(user_input: str):
    """Return the three corpus quotes most similar to ``user_input``.

    The query is embedded with the module-level ``model`` and compared to
    ``corpus_emb`` by cosine similarity. The three highest-scoring quotes are
    returned as one string, numbered ``1.`` to ``3.`` and separated by blank
    lines. If the input is empty or whitespace-only, a prompt message is
    returned instead and no embedding is computed.
    """
    # Guard clause: nothing meaningful to search for.
    if not (user_input and user_input.strip()):
        return "Please type something (e.g., 'happiness', 'overcoming failure', 'creativity')."

    query_vec = model.encode(user_input, convert_to_tensor=True)
    # cos_sim returns one row per query; there is a single query here.
    scores = util.cos_sim(query_vec, corpus_emb)[0]
    best_indices = torch.topk(scores, k=3).indices.tolist()

    lines = []
    for rank, quote_idx in enumerate(best_indices, start=1):
        lines.append(f"{rank}. {quotes[quote_idx]}")
    return "\n\n".join(lines)
# ---- 4) Gradio UI ----
# Build the input/output widgets first so the Interface call stays compact.
_query_box = gr.Textbox(lines=2, placeholder="Type a theme or sentence...")
_results_box = gr.Textbox(label="Top 3 similar quotes")

# Single-function interface: text in, formatted top-3 quotes out.
demo = gr.Interface(
    fn=get_top3,
    inputs=_query_box,
    outputs=_results_box,
    title="Quote Finder (Semantic Search)",
    description="Enter any phrase to get 3 semantically similar quotes. Works with your uploaded CSV or a public fallback.",
    examples=[["happiness"], ["overcoming failure"], ["friendship"]],
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()