Shea
update
511e1bc
raw
history blame contribute delete
No virus
3.68 kB
import gradio as gr
from gradio import components
import numpy as np
import pandas as pd
import pyarrow
import os
import requests
# Download the song-lyrics dataset into the current working directory and
# load it into a DataFrame used by the rest of the script.
url = 'https://huggingface.co/datasets/sheacon/song_lyrics/resolve/main/v2ga_w_embeddings_half.parquet'
filename = os.path.join(os.getcwd(), url.split('/')[-1])
# Stream the body in chunks so the whole file is never held in memory; the
# context manager releases the connection even if writing fails part-way.
with requests.get(url, stream=True, timeout=60) as response:
    # Fail fast on an HTTP error instead of saving an error page as a
    # .parquet file that would only blow up later in read_parquet.
    response.raise_for_status()
    with open(filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip empty keep-alive chunks
                file.write(chunk)
print(f"File '{filename}' download complete.")
# Read exactly the file that was just written rather than repeating its name.
df = pd.read_parquet(filename)
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors *v1* and *v2*.

    The result is the dot product divided by the product of the two
    Euclidean norms. If either vector has zero norm the cosine is
    undefined and ``np.nan`` is returned instead.
    """
    numerator = np.dot(v1, v2)
    magnitude_1 = np.linalg.norm(v1)
    magnitude_2 = np.linalg.norm(v2)
    # Guard against division by zero for all-zero vectors.
    if magnitude_1 == 0.0 or magnitude_2 == 0.0:
        return np.nan
    return numerator / (magnitude_1 * magnitude_2)
def relevance_scores(query_embed, df, embeddings):
    """Score every row of *df* against *query_embed* by cosine similarity.

    Args:
        query_embed: The query embedding vector.
        df: DataFrame holding one embedding per row.
        embeddings: Name of the column in *df* that contains the embeddings.

    Returns:
        A ``pd.Series`` of similarities sorted from highest to lowest. Its
        labels are the positional row numbers (0..n-1) of *df*, and the
        top-ranked entry has been overwritten with 0.
    """
    ranked = pd.Series(
        [cosine_similarity(query_embed, candidate) for candidate in df[embeddings]]
    ).sort_values(ascending=False)
    # Zero out the best match. NOTE(review): presumably this is meant to
    # exclude the query item itself, which would rank first when it is part
    # of df — confirm that assumption holds for all callers.
    ranked.iloc[0] = 0
    return ranked
def _song_summary(row):
    """Return title, artist and lyrics of *row* as a dict, with lyric newlines replaced by '. '."""
    summary = row[['title', 'artist', 'lyrics']].to_dict()
    summary['lyrics'] = summary['lyrics'].replace('\n', '. ')
    return summary


def semantic_search(artist, title):
    """Find the best-matching song for the chosen one under each embedding.

    Args:
        artist: Artist name, matched exactly against ``df['artist']``.
        title: Song title, matched exactly against ``df['title']``.

    Returns:
        A dict with the keys ``'chosen_song'``, ``'glove'``, ``'minilm'``,
        ``'roberta'`` and ``'gpt'``. Each value is a dict with the keys
        ``'title'``, ``'artist'`` and ``'lyrics'``; newlines in the lyrics
        are replaced by ``'. '``.

    Raises:
        IndexError: If no row of ``df`` has this artist/title combination.
            (The exception type matches what the lookup raised before, but
            now carries a message that names the missing song.)
    """
    matches = df[(df['artist'] == artist) & (df['title'] == title)]
    if matches.empty:
        raise IndexError(
            f"No song titled {title!r} by {artist!r} was found in the dataset."
        )
    # When several rows match, the first one is used.
    chosen_song = matches.iloc[0]

    results = {'chosen_song': _song_summary(chosen_song)}
    for model in ('glove', 'minilm', 'roberta', 'gpt'):
        column = f"embedding_{model}"
        scores = relevance_scores(chosen_song[column], df, column)
        # relevance_scores labels its result by positional row number, so
        # the label returned by idxmax is a valid argument for df.iloc.
        results[model] = _song_summary(df.iloc[scores.idxmax()])
    return results
from gradio.components import Dropdown
# Dropdown choices: every distinct artist and every distinct title in df.
# NOTE(review): the two dropdowns are independent, so a user can select an
# artist/title pair that does not exist in the dataset.
artists = sorted(df['artist'].unique())
titles = sorted(df['title'].unique())
artist_dropdown = Dropdown(artists, label="Artist")
title_dropdown = Dropdown(titles, label="Title")

# Up to 100 random [artist, title] pairs passed to the interface as examples.
# The sample size is clamped because DataFrame.sample raises ValueError when
# asked for more rows than the frame contains (sampling without replacement).
df_sample = df.sample(min(100, len(df)))
sample_artists = df_sample['artist'].tolist()
sample_titles = df_sample['title'].tolist()
artist_title_sample = [
    [sample_artist, sample_title]
    for sample_artist, sample_title in zip(sample_artists, sample_titles)
]

# The search result dict is rendered as JSON.
output_interface = gr.components.JSON(label="Similar Songs")
iface = gr.Interface(
    fn=semantic_search,
    inputs=[artist_dropdown, title_dropdown],
    outputs=output_interface,
    examples=artist_title_sample,
    title="Similar Song Finder",
    description="Find four similar songs to the selected song based on different embeddings (GloVe, MiniLM, RoBERTa, GPT).",
)
iface.launch()