Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
import numpy as np | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sentence_transformers import SentenceTransformer | |
from datasets import load_dataset | |
# Load the lyrics dataset from the Hugging Face Hub.
dataset = load_dataset(
    "sheacon/song_lyrics",
    revision="main",  # tag name, or branch name, or commit hash
)

# Without a `split` argument, load_dataset returns a DatasetDict (one Dataset
# per split), and DatasetDict has no to_pandas(). Convert each split and
# stack them so `df` holds every row under a fresh 0..n-1 index.
df = pd.concat(
    [split.to_pandas() for split in dataset.values()],
    ignore_index=True,
)

# Model used to embed the user's query at search time.
minilm = SentenceTransformer('all-MiniLM-L12-v2')
#roberta = SentenceTransformer('all-distilroberta-v1')
#glove = SentenceTransformer('average_word_embeddings_glove.840B.300d')

# Song embeddings are read from the dataset's "embedding" column rather than
# computed here.
# NOTE(review): presumably they were produced with the same MiniLM model as
# the query encoder; confirm, otherwise similarity scores are not meaningful.
song_embeddings = df["embedding"].tolist()
def search_songs(text, top_n=5):
    """Return the songs whose embeddings are most similar to ``text``.

    Args:
        text: Free-form query string to embed and compare against the songs.
        top_n: Maximum number of songs to return.

    Returns:
        A list of ``{"title": ..., "lyrics": ...}`` dicts ordered from most
        to least similar; empty when ``top_n`` is not positive.
    """
    # A negative top_n would otherwise slice off the tail instead of limiting
    # the result count.
    if top_n <= 0:
        return []

    # A SentenceTransformer must be driven through encode(); calling the model
    # object directly runs the underlying torch forward pass, which expects
    # tokenized features rather than raw strings.
    text_embedding = minilm.encode([text])[0]

    # One row of scores: the query against every song embedding.
    similarities = cosine_similarity([text_embedding], song_embeddings)[0]

    # argsort is ascending, so reverse it to put the best matches first.
    top_indices = similarities.argsort()[::-1][:top_n]

    # Fetch the selected rows once instead of two iloc lookups per result.
    rows = df.iloc[top_indices]
    return [
        {"title": title, "lyrics": lyrics}
        for title, lyrics in zip(rows["title"], rows["lyrics"])
    ]
# Build the web UI: a textbox for the query, plain text for the results,
# and one example query.
iface = gr.Interface(
    fn=search_songs,
    inputs="textbox",
    outputs="text",
    examples=[["I'm feeling lonely tonight"]],
)

# Start serving the app.
iface.launch()