Shea
update
511e1bc
import gradio as gr
from gradio import components
import numpy as np
import pandas as pd
import pyarrow
import os
import requests
# Download the lyrics/embeddings dataset into the current working directory
# and load it into the module-level DataFrame `df` used by the rest of the app.
url = 'https://huggingface.co/datasets/sheacon/song_lyrics/resolve/main/v2ga_w_embeddings_half.parquet'
filename = os.path.join(os.getcwd(), url.split('/')[-1])

# The response is used as a context manager so the connection is released
# even if writing fails; the timeout keeps startup from hanging forever on a
# stalled connection.
with requests.get(url, stream=True, timeout=60) as response:
    # Fail fast on HTTP errors instead of saving an error page as a
    # ".parquet" file and crashing later inside the parquet reader.
    response.raise_for_status()
    with open(filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)
print(f"File '{filename}' download complete.")

# Read back exactly the file that was just written.
df = pd.read_parquet(filename)
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    The result is the dot product of ``v1`` and ``v2`` divided by the
    product of their Euclidean norms. If either vector has zero norm the
    similarity is undefined and ``np.nan`` is returned.
    """
    numerator = np.dot(v1, v2)
    norm_a, norm_b = np.linalg.norm(v1), np.linalg.norm(v2)

    # Only divide when both vectors have a non-zero length.
    if norm_a != 0.0 and norm_b != 0.0:
        return numerator / (norm_a * norm_b)
    return np.nan
def relevance_scores(query_embed, df, embeddings):
    """Rank every embedding in ``df[embeddings]`` against ``query_embed``.

    Args:
        query_embed: Embedding vector of the query item.
        df: Table indexable by column name, holding one embedding per row.
        embeddings: Name of the column in ``df`` that holds the embeddings.

    Returns:
        A ``pd.Series`` of cosine similarities sorted in descending order.
        Each label is the position of the corresponding embedding in
        ``df[embeddings]``. The highest score is overwritten with 0. When
        the column has no rows an empty float Series is returned.
    """
    scores = [cosine_similarity(query_embed, v2) for v2 in df[embeddings]]

    # Nothing to rank: return an empty result instead of failing on the
    # positional write below.
    if not scores:
        return pd.Series(scores, dtype=float)

    # sort scores in descending order
    scores = pd.Series(scores).sort_values(ascending=False)

    # Zero the top-ranked score.
    # NOTE(review): presumably this suppresses the query's own row (expected
    # to rank first when it is part of df) so a later idxmax() picks a
    # different row -- confirm this assumption holds for all callers.
    scores.iloc[0] = 0
    return scores
def _song_record(row):
    """Return title/artist/lyrics of a song row as a dict, with lyric newlines flattened."""
    record = row[['title', 'artist', 'lyrics']].to_dict()
    record['lyrics'] = record['lyrics'].replace('\n', '. ')
    return record


def semantic_search(artist, title):
    """Find the closest song to the selected one under each embedding model.

    Args:
        artist: Value matched exactly against the ``artist`` column of ``df``.
        title: Value matched exactly against the ``title`` column of ``df``.

    Returns:
        A dict with the keys ``'chosen_song'``, ``'glove'``, ``'minilm'``,
        ``'roberta'`` and ``'gpt'``. Each value is a dict holding the
        ``title``, ``artist`` and ``lyrics`` of a song, with newlines in
        the lyrics replaced by ``'. '``. If several rows match the
        artist/title pair, the first one is used.

    Raises:
        IndexError: If no row of ``df`` has this artist/title combination.
    """
    matches = df[(df['artist'] == artist) & (df['title'] == title)]
    if matches.empty:
        # The artist and title inputs are chosen independently, so a pair
        # that does not exist in the dataset is an expected user error;
        # report it clearly.
        raise IndexError(
            f"No song titled {title!r} by {artist!r} was found in the dataset."
        )
    chosen_song = matches.iloc[0]

    results = {'chosen_song': _song_record(chosen_song)}
    for model in ('glove', 'minilm', 'roberta', 'gpt'):
        column = f"embedding_{model}"
        scores = relevance_scores(chosen_song[column], df, column)
        # Score labels are row positions in df, so idxmax() can be fed
        # straight to iloc.
        results[model] = _song_record(df.iloc[scores.idxmax()])
    return results
from gradio.components import Dropdown

# Dropdown choices: every distinct artist and every distinct title in the
# dataset, sorted. The two lists are independent of each other.
artists = sorted(df['artist'].unique())
titles = sorted(df['title'].unique())
artist_dropdown = Dropdown(artists, label="Artist")
title_dropdown = Dropdown(titles, label="Title")

# Up to 100 random examples; the sample size is capped at the dataset size
# because DataFrame.sample raises when asked for more rows than exist.
df_sample = df.sample(min(100, len(df)))
sample_artists = df_sample['artist'].tolist()
sample_titles = df_sample['title'].tolist()
# Each example is an [artist, title] pair taken from the same sampled row.
artist_title_sample = [
    [sample_artist, sample_title]
    for sample_artist, sample_title in zip(sample_artists, sample_titles)
]

output_interface = gr.components.JSON(label="Similar Songs")

iface = gr.Interface(
    fn=semantic_search,
    inputs=[artist_dropdown, title_dropdown],
    outputs=output_interface,
    examples=artist_title_sample,
    title="Similar Song Finder",
    description="Find four similar songs to the selected song based on different embeddings (GloVe, MiniLM, RoBERTa, GPT)."
)

iface.launch()