"""Gradio app that finds the most similar song under several lyric embeddings.

On start-up the script downloads a parquet file of songs, loads it into a
module-level DataFrame and launches a Gradio interface.  For a selected
artist/title pair, the interface reports the closest other song under each
of the embedding columns listed in ``EMBEDDING_COLUMNS``.
"""

import os

import gradio as gr
import numpy as np
import pandas as pd
import pyarrow
import requests
from gradio import components
from gradio.components import Dropdown

url = 'https://huggingface.co/datasets/sheacon/song_lyrics/resolve/main/v2ga_w_embeddings_half.parquet'
filename = os.path.join(os.getcwd(), url.split('/')[-1])

# Result key -> DataFrame column holding that model's embedding vectors.
EMBEDDING_COLUMNS = {
    'glove': 'embedding_glove',
    'minilm': 'embedding_minilm',
    'roberta': 'embedding_roberta',
    'gpt': 'embedding_gpt',
}

# Columns reported for every song in the search result.
SUMMARY_COLUMNS = ['title', 'artist', 'lyrics']


def _download(source_url, destination, chunk_size=8192, timeout=30):
    """Stream ``source_url`` to the file ``destination``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never written out as the data file.
    """
    with requests.get(source_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(destination, 'wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty keep-alive chunks
                    file.write(chunk)


_download(url, filename)
print(f"File '{filename}' download complete.")

# Read the file that was just written instead of a second, hard-coded name.
df = pd.read_parquet(filename)


def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Returns ``np.nan`` when either vector has zero norm, since the
    similarity is undefined in that case.
    """
    v1_norm = np.linalg.norm(v1)
    v2_norm = np.linalg.norm(v2)
    if v1_norm == 0.0 or v2_norm == 0.0:
        return np.nan
    return np.dot(v1, v2) / (v1_norm * v2_norm)


def relevance_scores(query_embed, df, embeddings, exclude=None):
    """Score every row of ``df`` against ``query_embed``.

    Args:
        query_embed: the query vector.
        df: DataFrame whose ``embeddings`` column holds one vector per row.
        embeddings: name of the embedding column to compare against.
        exclude: optional 0-based row position (as used by ``df.iloc``) to
            leave out of the ranking, typically the query's own row.  Its
            score is replaced by NaN so ``idxmax`` can never select it.

    Returns:
        A float Series sorted in descending order.  Its index labels are the
        row positions in ``df``.  When ``exclude`` is None the historical
        behaviour is kept: the top-ranked score is overwritten with 0 on the
        assumption that it is the query itself.
    """
    scores = pd.Series(
        [cosine_similarity(query_embed, vector) for vector in df[embeddings]],
        dtype='float64',
    )

    if exclude is not None:
        # Mask the query by position; zeroing the top score would hit the
        # wrong row on ties and could still win when all others are negative.
        scores.iloc[exclude] = np.nan
        return scores.sort_values(ascending=False)

    scores = scores.sort_values(ascending=False)
    if not scores.empty:
        scores.iloc[0] = 0
    return scores


def _song_summary(row):
    """Return title, artist and lyrics of ``row`` as a plain dict.

    Newlines in the lyrics are replaced by '. '.  Working on the dict avoids
    assigning into a slice of the module-level DataFrame.
    """
    summary = row[SUMMARY_COLUMNS].to_dict()
    summary['lyrics'] = summary['lyrics'].replace('\n', '. ')
    return summary


def semantic_search(artist, title):
    """Find the most similar song to ``artist``/``title`` per embedding.

    Args:
        artist: artist name, compared for equality with the 'artist' column.
        title: song title, compared for equality with the 'title' column.

    Returns:
        A dict with the key 'chosen_song' followed by one key per entry of
        ``EMBEDDING_COLUMNS``.  Each value is a title/artist/lyrics dict, or
        None when no valid similarity could be computed for that embedding.

    Raises:
        gr.Error: if no row matches the given artist and title.  The two
            dropdowns are independent, so such a pair is easy to select.
    """
    is_match = ((df['artist'] == artist) & (df['title'] == title)).to_numpy()
    positions = np.flatnonzero(is_match)
    if positions.size == 0:
        raise gr.Error(f"No song titled {title!r} by {artist!r} was found.")

    # As before, the first matching row is used when several rows match.
    chosen_position = int(positions[0])
    chosen_row = df.iloc[chosen_position]

    results = {'chosen_song': _song_summary(chosen_row)}
    for key, column in EMBEDDING_COLUMNS.items():
        scores = relevance_scores(
            chosen_row[column], df, column, exclude=chosen_position
        )
        if scores.notna().any():
            results[key] = _song_summary(df.iloc[scores.idxmax()])
        else:
            # e.g. a zero-norm query vector makes every similarity NaN
            results[key] = None
    return results


# dropna() keeps sorted() from failing on a mix of strings and missing values.
artists = sorted(df['artist'].dropna().unique())
titles = sorted(df['title'].dropna().unique())

artist_dropdown = Dropdown(artists, label="Artist")
title_dropdown = Dropdown(titles, label="Title")

# Up to 100 random examples; capped so that a small dataset does not make
# DataFrame.sample raise.
df_sample = df.sample(min(100, len(df)))
sample_artists = df_sample['artist'].tolist()
sample_titles = df_sample['title'].tolist()
artist_title_sample = [
    [sample_artist, sample_title]
    for sample_artist, sample_title in zip(sample_artists, sample_titles)
]

output_interface = gr.components.JSON(label="Similar Songs")

iface = gr.Interface(
    fn=semantic_search,
    inputs=[artist_dropdown, title_dropdown],
    outputs=output_interface,
    examples=artist_title_sample,
    title="Similar Song Finder",
    description="Find four similar songs to the selected song based on different embeddings (GloVe, MiniLM, RoBERTa, GPT)."
)

iface.launch()