semantic-song-search

Runtime error

App Files Files Community

Shea committed on Apr 18, 2023

Commit

511e1bc

1 Parent(s): 5986780

update

Browse files

Files changed (3) hide show

app.ipynb +253 -0
app.py +70 -40
v2ga_w_embeddings_half.parquet +3 -0

app.ipynb ADDED Viewed

	@@ -0,0 +1,253 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import pyarrow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet('v2ga_w_embeddings_half.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cosine_similarity(v1, v2):\n",
+    "    dot_product = np.dot(v1, v2)\n",
+    "    v1_norm = np.linalg.norm(v1)\n",
+    "    v2_norm = np.linalg.norm(v2)\n",
+    "    if v1_norm == 0.0 or v2_norm == 0.0:\n",
+    "        return np.nan\n",
+    "    else:\n",
+    "        similarity = dot_product / (v1_norm * v2_norm)\n",
+    "        return similarity"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def relevance_scores(query_embed,df,embeddings):\n",
+    "    scores = [cosine_similarity(query_embed, v2) for v2 in df[embeddings]]\n",
+    "    scores = pd.Series(scores)\n",
+    "    # sort scores in descending order\n",
+    "    scores = scores.sort_values(ascending=False)\n",
+    "    # set first score to 0\n",
+    "    scores.iloc[0] = 0\n",
+    "    return(scores)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def semantic_search(artist, title):\n",
+    "\n",
+    "    chosen_song = df[(df['artist'] == artist) & (df['title'] == title)]\n",
+    "\n",
+    "    scores_glove = relevance_scores(chosen_song[\"embedding_glove\"].values[0],df,\"embedding_glove\")\n",
+    "    index_glove = scores_glove.idxmax()\n",
+    "    result_glove = df.iloc[index_glove][['title', 'artist', 'lyrics']]\n",
+    "    result_glove['lyrics'] = result_glove['lyrics'].replace('\\n', '. ')\n",
+    "\n",
+    "    scores_minilm = relevance_scores(chosen_song[\"embedding_minilm\"].values[0],df,\"embedding_minilm\")\n",
+    "    index_minilm = scores_minilm.idxmax()\n",
+    "    result_minilm = df.iloc[index_minilm][['title', 'artist', 'lyrics']]\n",
+    "    result_minilm['lyrics'] = result_minilm['lyrics'].replace('\\n', '. ')\n",
+    "\n",
+    "    scores_roberta = relevance_scores(chosen_song[\"embedding_roberta\"].values[0],df,\"embedding_roberta\")\n",
+    "    index_roberta = scores_roberta.idxmax()\n",
+    "    result_roberta = df.iloc[index_roberta][['title', 'artist', 'lyrics']]\n",
+    "    result_roberta['lyrics'] = result_roberta['lyrics'].replace('\\n', '. ')\n",
+    "\n",
+    "    scores_gpt = relevance_scores(chosen_song[\"embedding_gpt\"].values[0],df,\"embedding_gpt\")\n",
+    "    index_gpt = scores_gpt.idxmax()\n",
+    "    result_gpt = df.iloc[index_gpt][['title', 'artist', 'lyrics']]\n",
+    "    result_gpt['lyrics'] = result_gpt['lyrics'].replace('\\n', '. ')\n",
+    "\n",
+    "    chosen_song = chosen_song[['title', 'artist', 'lyrics']].iloc[0]\n",
+    "    chosen_song['lyrics'] = chosen_song['lyrics'].replace('\\n', '. ')\n",
+    "\n",
+    "    results = {\n",
+    "        'chosen_song': chosen_song.to_dict(),\n",
+    "        'glove': result_glove.to_dict(),\n",
+    "        'minilm': result_minilm.to_dict(),\n",
+    "        'roberta': result_roberta.to_dict(),\n",
+    "        'gpt': result_gpt.to_dict()\n",
+    "    }\n",
+    "\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'chosen_song': {'title': 'Century City',\n",
+       "  'artist': 'Tom Petty',\n",
+       "  'lyrics': \"Tom Petty\\nMiscellaneous\\nCentury City\\nSometimes I wanna leave here\\nSometimes I wanna go right back where I came from\\nBack where I belong\\nBut it never lasts for too long, it always goes away\\nAnd I still don't look for reasons\\nThat's much too hard these days\\nWhy worry 'bout the rain?\\nWhy worry 'bout the thunder?\\nCentury City's got everything covered\\nWell, your mama gave you lovin'\\nMama held you near\\nNow mama can't do nothin'\\nBaby, mama just ain't here\\nAnd you can pretend all you want to do\\nBut that won't work no more\\nAnd you can't run back to daddy\\nYou tried that once before\\nBut why worry 'bout your daddy?\\nWhy worry 'bout your mother?\\nCentury City's got everything covered\\nWe're gonna live in Century City\\nGo ahead and give in (Century City) like modern men\\nAnd modern girls, we're gonna live in the modern world\\nSometimes I get discouraged\\nSometimes I feel so down\\nSometimes I get so worried\\nAnd I don't know what about\\nBut it works out in the long run\\nIt always goes away\\nI've come now to accept it\\nAs a reoccurring phase\\nWhy worry 'bout the rain?\\nWhy worry 'bout the thunder?\\nCentury City's got everything covered\"},\n",
+       " 'glove': {'title': 'Visit',\n",
+       "  'artist': '311',\n",
+       "  'lyrics': \"He wouldn't say he cared at all if you asked him. You're heading for a fall brother it goes right past him. In another world he's in another place. You now the need for speed. Is just another form of greed. But when you jones'n your brains frozen. You're not thinkin' bout the choices you makin' you're just rollin'. On and on it's anybody's guess 'cause no one's at the wheel at the front of. That mess yes. . The sun's goin' down for me it's goin' down for me. He and she are what I need yes they're what I need. But I'm gonna buy a ticket I'm not gonna even pick it. I heard it he said it I heard it. But it we but it went something like. . Chorus :. Visit. I wanna visit the world. So now I visit the world. With my time on this world. Because livin' when you're hungry is a dog in an alley. Now I mind my business 'cause I'm rhymin' down in Cali. I ain't playin' when sayin' kings of the ghetto feel they losing ground. Vato's goin' loco because everyone's brought down. This is a visit then we dead fade to dust strickin'. I'm wearing my Doc Martens 'cause I'm always down for kickin'. This my friend the city pity everywhere the enemy. Ready or not homey stompin' everyone in front of me. . Jump up and down cus that's the 311 style. Cruise on by the frowners float follow me now with a smile. Looking at the ocean I say there's plenty. Looking at the river I say there's plenty. Chorus. . Jump up and down 'cause that's the 311 style. Cruise on by the frowners float follow me now with a smile. I got a golden ticket I'm not gonna even pick it. I heard it he said it I heard it. But it went but it went something like. Chorus. Chorus\"},\n",
+       " 'minilm': {'title': 'Leaving Town Blues take 5',\n",
+       "  'artist': 'Fleetwood Mac',\n",
+       "  'lyrics': \"Written by Peter Green. . Now, when I leave this town. Won't be - back no more. Yes, when I leave this town, mama. Won't be back no more. I've got the blues so bad. Standing 'round my door. . Yes, I got the blues so bad. Why won't you let me be. I have the blues so bad, mama. Why won't you let me be. Now, when I go to Chicago. Blues don't you follow me. . Break:. . I'm asking mama, mama. Please don't cry no tears. Yes, I say mama, mama. Please don't cry no tears. Because I'll always love you. Through all my days and years\"},\n",
+       " 'roberta': {'title': 'Someday Baby - Alternate Version Modern Times',\n",
+       "  'artist': 'Bob Dylan',\n",
+       "  'lyrics': \"I don't care what you do, don't care what you say. Don't care where you go, or how long you stay. Someday baby, you ain't gonna worry po' me any more. . You take my money and you turn me out. You fill me up with self-doubt. Someday baby, you ain't gonna worry po' me any more. . You made me eat a ton of dust. You're potentially dangerous and not worthy of trust. Someday baby, you ain't gonna worry po' me any more. . Little by little, bit by bit. Every day I'm becomin' more of a hypocrite. Someday baby, you ain't gonna worry po' me any more. . You've got my mind tied up in knots. I just keep recyclin' the same old thoughts. Someday baby, you ain't gonna worry po' me any more. . When I heard you was cold, I bought you a coat and hat. I think you must have forgotten 'bout that. Someday baby, you ain't gonna worry po' me any more. Gonna blow out your mind and make it pure. I've taken about as much of this as I can endure. Someday baby, you ain't gonna worry po' me any more. . You put me down from a haver creak. That's all right, to you i turn the other cheek. Someday baby, you ain't gonna worry po' me any more. . You say you need me, how would I know?. You say you love me, but it can't be so. Someday baby, you ain't gonna worry po' me any more. . I don't wanna brag, but I'll wring your neck. When all else fails, I'll make it a matter of self-respect. Someday baby, you ain't gonna worry po' me any more. . Livin' this way ain't a natural thing to do. Why was I born to love you?. Someday baby, you ain't gonna worry po' me any more\"},\n",
+       " 'gpt': {'title': 'Long Time Sunshine Reprise',\n",
+       "  'artist': 'Weezer',\n",
+       "  'lyrics': \"Sometimes I wanna pack it all up, get on a bus and move to Vermont. Or Maine, or any of those states back east that I remember. Sometimes I wanna go back to school, an east coast college with some history. I'd be satisfied, I know, in the simple things. Longtime sunshine. Longtime sunshine upon me. Sometime I wanna build a house with a wood stove or a fire place. In the middle of the living room an old piano. Sometimes it don't seem so bad to settle down with a good woman;. Leave this lonely life behind forever, and ever. Longtime sunshine. Longtime sunshine upon me. Goodbye friends, goodbye my girl, close my eyes as you fly away;. Keep on going 'til you get some place where you can truly rest. Longtime sunshine. Longtime sunshine upon me. Long time sunshine, long time sunshine upon me. Why bother? It's gonna hurt me, it's gonna kill when you desert me. He is in my eyes, he is in my ears, he is in my blood, he is in my tears. No, there is no other one, no, there is no other one, can't have any other one, even though I ???. Blast off! Up to the stars we go, and leave behind everything I use to know. Somebody's giving me a whole lot of money to do what I think I want to. So why am I still feeling blue? Oh Wuan and Dondo\"}}"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "semantic_search(\"Tom Petty\", \"Century City\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "title                                          Century City\n",
+       "artist                                            Tom Petty\n",
+       "lyrics    Tom Petty. Miscellaneous. Century City. Someti...\n",
+       "Name: 3833, dtype: object"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "artist = \"Tom Petty\"\n",
+    "title = \"Century City\"\n",
+    "\n",
+    "chosen_song = df[(df['artist'] == artist) & (df['title'] == title)]\n",
+    "chosen_song = chosen_song[['title', 'artist', 'lyrics']].iloc[0]\n",
+    "chosen_song['lyrics'] = chosen_song['lyrics'].replace('\\n', '. ')\n",
+    "\n",
+    "chosen_song"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>artist</th>\n",
+       "      <th>lyrics</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>3833</th>\n",
+       "      <td>Century City</td>\n",
+       "      <td>Tom Petty</td>\n",
+       "      <td>Tom Petty\\nMiscellaneous\\nCentury City\\nSometi...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             title     artist   \n",
+       "3833  Century City  Tom Petty  \\\n",
+       "\n",
+       "                                                 lyrics  \n",
+       "3833  Tom Petty\\nMiscellaneous\\nCentury City\\nSometi...  "
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chosen_song"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import gradio as gr
 import numpy as np
 import pandas as pd
 import pyarrow
-from sklearn.metrics.pairwise import cosine_similarity
 import os
 import requests
@@ -16,59 +16,89 @@ with open(filename, 'wb') as file:
         if chunk:
             file.write(chunk)
-print(f"File '{filename}' has been downloaded to the present working directory.")
-pwd = os.getcwd()
-print("Present Working Directory:", pwd)
-contents = os.listdir(pwd)
-print("Contents of the Directory:")
-for item in contents:
-    print(item)
 df = pd.read_parquet('v2ga_w_embeddings_half.parquet')
-def get_most_similar_songs(artist, title, df):
-    def find_most_similar(embedding_column):
-        chosen_song = df[(df['artist'] == artist) & (df['title'] == title)][embedding_column].values
-        if len(chosen_song) == 0:
-            return None
-        chosen_song = chosen_song.reshape(1, -1)
-        similarity_matrix = cosine_similarity(df[embedding_column].values.tolist(), chosen_song)
-        most_similar_indices = np.argsort(similarity_matrix.flatten())[-5:-1][::-1] # Top 4 excluding the selected song
-        return df.iloc[most_similar_indices][['title', 'artist', 'lyrics']].to_dict(orient='records')
-    results = {}
-    for embedding in ['embedding_glove', 'embedding_minilm', 'embedding_roberta', 'embedding_gpt']:
-        most_similar = find_most_similar(embedding)
-        if most_similar is None:
-            return "Song not found. Please ensure the artist and title are correct."
-        results[embedding] = most_similar
     return results
-def update_titles_dropdown(artist):
-    titles = sorted(df[df['artist'] == artist]['title'].unique())
-    return titles
 artists = sorted(df['artist'].unique())
-artist_dropdown = gr.inputs.Dropdown(artists, label="Artist")
-title_dropdown = gr.inputs.Dropdown([], label="Title", updatable=True)
-output_interface = gr.outputs.JSON(label="Similar Songs")
 iface = gr.Interface(
-    fn=get_most_similar_songs,
     inputs=[artist_dropdown, title_dropdown],
     outputs=output_interface,
-    examples=[("The Beatles", "Let It Be"), ("Eminem", "Lose Yourself")],
-    title="Semantic Song Search: Most Similar Song",
-    description="Find the 4 most similar songs to the selected song based on different embeddings (GloVe, MiniLM, RoBERTa, GPT).",
-    update=update_titles_dropdown
 )
 iface.launch()

 import gradio as gr
+from gradio import components
 import numpy as np
 import pandas as pd
 import pyarrow
 import os
 import requests
         if chunk:
             file.write(chunk)
+print(f"File '{filename}' download complete.")
 df = pd.read_parquet('v2ga_w_embeddings_half.parquet')
+def cosine_similarity(v1, v2):
+    dot_product = np.dot(v1, v2)
+    v1_norm = np.linalg.norm(v1)
+    v2_norm = np.linalg.norm(v2)
+    if v1_norm == 0.0 or v2_norm == 0.0:
+        return np.nan
+    else:
+        similarity = dot_product / (v1_norm * v2_norm)
+        return similarity
+def relevance_scores(query_embed,df,embeddings):
+    scores = [cosine_similarity(query_embed, v2) for v2 in df[embeddings]]
+    scores = pd.Series(scores)
+    # sort scores in descending order
+    scores = scores.sort_values(ascending=False)
+    # set first score to 0
+    scores.iloc[0] = 0
+    return(scores)
+def semantic_search(artist, title):
+    chosen_song = df[(df['artist'] == artist) & (df['title'] == title)]
+    scores_glove = relevance_scores(chosen_song["embedding_glove"].values[0],df,"embedding_glove")
+    index_glove = scores_glove.idxmax()
+    result_glove = df.iloc[index_glove][['title', 'artist', 'lyrics']]
+    result_glove['lyrics'] = result_glove['lyrics'].replace('\n', '. ')
+    scores_minilm = relevance_scores(chosen_song["embedding_minilm"].values[0],df,"embedding_minilm")
+    index_minilm = scores_minilm.idxmax()
+    result_minilm = df.iloc[index_minilm][['title', 'artist', 'lyrics']]
+    result_minilm['lyrics'] = result_minilm['lyrics'].replace('\n', '. ')
+    scores_roberta = relevance_scores(chosen_song["embedding_roberta"].values[0],df,"embedding_roberta")
+    index_roberta = scores_roberta.idxmax()
+    result_roberta = df.iloc[index_roberta][['title', 'artist', 'lyrics']]
+    result_roberta['lyrics'] = result_roberta['lyrics'].replace('\n', '. ')
+    scores_gpt = relevance_scores(chosen_song["embedding_gpt"].values[0],df,"embedding_gpt")
+    index_gpt = scores_gpt.idxmax()
+    result_gpt = df.iloc[index_gpt][['title', 'artist', 'lyrics']]
+    result_gpt['lyrics'] = result_gpt['lyrics'].replace('\n', '. ')
+    chosen_song = chosen_song[['title', 'artist', 'lyrics']].iloc[0]
+    chosen_song['lyrics'] = chosen_song['lyrics'].replace('\n', '. ')
+    results = {
+        'chosen_song': chosen_song.to_dict(),
+        'glove': result_glove.to_dict(),
+        'minilm': result_minilm.to_dict(),
+        'roberta': result_roberta.to_dict(),
+        'gpt': result_gpt.to_dict()
+    }
     return results
+from gradio.components import Dropdown
+# Dropdown choices: every distinct artist and every distinct title in df.
 artists = sorted(df['artist'].unique())
+# NOTE(review): titles are not filtered by the selected artist, so the UI
+# lets the user pick an artist/title pair that has no matching row in df.
+titles = sorted(df['title'].unique())
+artist_dropdown = Dropdown(artists, label="Artist")
+title_dropdown = Dropdown(titles, label="Title")
+# 100 random examples
+# No random_state is passed, so a different sample is drawn on every start.
+df_sample = df.sample(100)
+sample_artists = df_sample['artist'].tolist()
+sample_titles = df_sample['title'].tolist()
+# One [artist, title] pair per sampled row, in the order of the two inputs.
+artist_title_sample = [[artist, titles] for artist, titles in zip(sample_artists, sample_titles)]
+output_interface = gr.components.JSON(label="Similar Songs")
+# Wire semantic_search to the two dropdowns and show its dict result as JSON.
 iface = gr.Interface(
+    fn=semantic_search,
     inputs=[artist_dropdown, title_dropdown],
     outputs=output_interface,
+    examples=artist_title_sample,
+    title="Similar Song Finder",
+    description="Find four similar songs to the selected song based on different embeddings (GloVe, MiniLM, RoBERTa, GPT)."
 )
+# Start the Gradio app.
 iface.launch()

v2ga_w_embeddings_half.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:405f8b1651052cc1b974530d76e767578f6d7d957f931d433446dc89096a7ef4
+size 133353339