File size: 2,963 Bytes
405c4e2
 
6f0e9b2
149c7e4
b7e9319
405c4e2
 
149c7e4
405c4e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f0e9b2
149c7e4
405c4e2
 
 
 
 
b9a61b8
 
b7e9319
405c4e2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# app.py — robust version (works with quotes_1000.csv, quotes.csv, or a public fallback)
import os
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
import torch

# ---- 1) Load quotes from available source ----
def load_quotes():
    """Load the quote corpus from the first available source.

    Sources are tried in priority order:

    1. ``quotes_1000.csv`` in the current working directory,
    2. ``quotes.csv`` in the current working directory,
    3. the first 1000 rows of the public ``Abirate/english_quotes`` dataset.

    The quote text is read from a column named ``quote`` or ``text`` (matched
    case-insensitively, ignoring whitespace around the header), or from the
    only column when the table has exactly one.

    Returns:
        list[str]: Whitespace-stripped, non-empty quotes with duplicates
        removed, in first-seen order.

    Raises:
        ValueError: If no usable column can be identified, or if fewer than
            10 unique quotes remain after cleaning.
    """
    # Priority 1: the 1000-quote file
    if os.path.exists("quotes_1000.csv"):
        df = pd.read_csv("quotes_1000.csv")
    # Priority 2: the older file, if present
    elif os.path.exists("quotes.csv"):
        df = pd.read_csv("quotes.csv")
    else:
        # Priority 3 (fallback): load 1000 from public dataset
        ds = load_dataset("Abirate/english_quotes", split="train")
        df = pd.DataFrame({"quote": [row["quote"] for row in ds.select(range(1000))]})

    # Normalize column names; str() guards against non-string labels and
    # strip() against headers such as " Quote" produced by ", "-separated CSVs.
    df.columns = [str(c).strip().lower() for c in df.columns]
    if "quote" in df.columns:
        series = df["quote"]
    elif "text" in df.columns:
        series = df["text"]
    elif len(df.columns) == 1:
        # If there is only one column, use it whatever it is called.
        series = df.iloc[:, 0]
    else:
        raise ValueError("CSV must have a 'quote' or 'text' column, or a single column.")

    # Strip BEFORE deduplicating: comparing the raw string against a set of
    # stripped strings would let "foo" and " foo " both through as "foo".
    stripped = (q.strip() for q in series.dropna().astype(str))
    # dict.fromkeys keeps the first occurrence of each key, preserving order.
    unique_quotes = list(dict.fromkeys(q for q in stripped if q))

    if len(unique_quotes) < 10:
        raise ValueError("Not enough quotes found. Please check your CSV.")
    return unique_quotes

# Load the quote corpus once, at import time; get_top3 indexes into this list.
quotes = load_quotes()

# ---- 2) Load embedding model and build embeddings ----
# Model identifier handed to SentenceTransformer below.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)
# Encode every quote once up front, as tensors, so that each search only has
# to encode the query and compare it against corpus_emb with cosine similarity.
# NOTE(review): get_top3 maps similarity indices straight back into `quotes`,
# so this presumably keeps one embedding per quote in the same order — confirm.
corpus_emb = model.encode(quotes, convert_to_tensor=True, show_progress_bar=True)

# ---- 3) Search function (top-3 similar) ----
def get_top3(user_input: str):
    """Return the three quotes most similar to ``user_input`` as one string.

    The query is embedded with the module-level ``model`` and scored against
    ``corpus_emb`` by cosine similarity; the three highest-scoring quotes are
    numbered 1-3 and separated by blank lines.

    Args:
        user_input: Free-text theme or sentence typed by the user.

    Returns:
        str: The formatted top-3 list, or a prompt asking for input when
        ``user_input`` is empty or whitespace-only.
    """
    # Guard clause: nothing meaningful to search for.
    if not (user_input and user_input.strip()):
        return "Please type something (e.g., 'happiness', 'overcoming failure', 'creativity')."

    query_vec = model.encode(user_input, convert_to_tensor=True)
    # cos_sim returns a (1, n_quotes) matrix here; take its single row.
    scores = util.cos_sim(query_vec, corpus_emb)[0]
    best_indices = torch.topk(scores, k=3).indices.tolist()

    numbered = []
    for rank, quote_idx in enumerate(best_indices, start=1):
        numbered.append(f"{rank}. {quotes[quote_idx]}")
    return "\n\n".join(numbered)

# ---- 4) Gradio UI ----
# Single-function Gradio app: one free-text input box feeds get_top3, whose
# returned string is shown in one output text box.
demo = gr.Interface(
    fn=get_top3,
    inputs=gr.Textbox(lines=2, placeholder="Type a theme or sentence..."),
    outputs=gr.Textbox(label="Top 3 similar quotes"),
    title="Quote Finder (Semantic Search)",
    description="Enter any phrase to get 3 semantically similar quotes. Works with your uploaded CSV or a public fallback.",
    # Each inner list is one example row; one element because there is one input.
    examples=[["happiness"], ["overcoming failure"], ["friendship"]]
)

# Start the UI only when this file is executed as a script; importing the
# module still builds `demo` but does not launch it.
if __name__ == "__main__":
    demo.launch()