# TaylorSwiftDJ/scripts/data_setup.py
from dotenv import load_dotenv
load_dotenv(dotenv_path="../env_vars.env")
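
# env_vars.env is expected to supply the settings read below (MODEL, DATASET,
# ACTIVELOOP_ORG_ID) plus the credentials the OpenAI and Deep Lake clients
# typically look for (OPENAI_API_KEY, ACTIVELOOP_TOKEN). A minimal sketch of
# that file; the values are placeholders:
#
#   OPENAI_API_KEY=sk-...
#   ACTIVELOOP_TOKEN=...
#   ACTIVELOOP_ORG_ID=your-activeloop-org
#   DATASET=your-dataset-name
#   MODEL=text-embedding-ada-002
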
import json
from pathlib import Path
import os
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import DeepLake
from utils import clean_emotions_json
"""
This function takes all the songs we have and use the lyrics to create a list of 8 emotions.
These 8 emotions will then be used for similarity matching with user prompt emotions, instead of using the entire lyrics.
"""
def generate_emotion_from_lyrics(input_file:str, prompt_path:str, output_file:str, clean_output=True) -> None:
    prompt = PromptTemplate(
        input_variables=["song_lyrics"],
        template=Path(prompt_path).read_text(),
    )
    llm = ChatOpenAI(temperature=0.8)
    chain = LLMChain(llm=llm, prompt=prompt)

    # Read the file that holds the scraped lyrics.
    with open(input_file, "r") as f:
        lyrical_data = json.load(f)
    '''
    Each entry in 'lyrical_data' looks like:
    {
        "song_name": "Cruel Summer",
        "iframe": "<iframe style=\"border-radius: 12px\" width=\"100%\" height=\"152\" title=\"Spotify Embed:...
        "lyrics": "Fever dream high in the quiet of the nightYou know that I caught it Bad...
    }
    '''
    # Collect 8 common emotions conveyed in each song, based on its lyrics
    emotion_data = []
    for song in lyrical_data:
        print(song["song_name"])
        emotions = chain.run(song_lyrics=song["lyrics"])
        emotion_data.append(
            {
                "song_name": song["song_name"],
                "iframe": song["iframe"],
                "emotions": emotions,
            }
        )
        print(emotions)

    # Write the output file, which is later used to store the song emotions as embeddings
    with open(output_file, "w") as f:
        json.dump(emotion_data, f, indent=4)
    print(f"Spotify song names, URLs, and emotions saved to {output_file}")

    # Clean the generated emotions
    if clean_output:
        clean_emotions_json(output_file)


def create_db(dataset_path: str, input_file: str) -> DeepLake:
    """Embed each song's emotion summary and store it in a DeepLake vector store."""
    with open(input_file, "r") as f:
        emotion_data = json.load(f)

    texts = []
    metadatas = []
    '''
    Each entry in 'emotion_data' looks like:
    {
        "song_name": "Mastermind",
        "iframe": "<iframe style=\"border-radius: 12px\" width=\"100%\" height=\"152\" title=\"Spotify Embed:
        "emotions": "excitement, happiness, love, desire, confidence, determination, vulnerability, manipulation"
    }
    '''
    for song in emotion_data:
        texts.append(song["emotions"])
        metadatas.append(
            {
                "name": song["song_name"],
                "iframe": song["iframe"],
            }
        )

    embeddings = OpenAIEmbeddings(model=os.environ["MODEL"])
    db = DeepLake.from_texts(
        texts, embeddings, metadatas=metadatas, dataset_path=dataset_path
    )
    return db
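
# A minimal usage sketch (not executed here): once the store is populated, it
# can be queried with LangChain's standard vector-store API to find songs whose
# emotion summary is closest to a mood description. The query string and k
# value below are hypothetical.
#
#   db = create_db(dataset_path, input_file)
#   matches = db.similarity_search("nostalgia, heartbreak, longing", k=3)
#   for doc in matches:
#       print(doc.metadata["name"])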


def create_emotion_embeddings(input_file: str) -> None:
    """Build the DeepLake dataset path from env vars and populate the vector store."""
    dataset_path = f"hub://{os.environ['ACTIVELOOP_ORG_ID']}/{os.environ['DATASET']}"
    create_db(dataset_path, input_file)


if __name__ == "__main__":
    # Note: run from the scripts/ directory so the relative ../prompts and ../data paths resolve.
    # Get the top 8 emotions for each scraped song lyric, using GPT-3.5 Turbo
    prompt_path = "../prompts/get_song_emotions.prompt"
    input_file = "../data/spotify_song_url_lyrics.json"
    output_file = "../data/spotify_song_url_emotions.json"
    generate_emotion_from_lyrics(input_file, prompt_path, output_file, clean_output=True)

    # Convert the generated emotions to embeddings (using the 'text-embedding-ada-002' model)
    # and save them in a vector store (DeepLake)
    input_file = "../data/spotify_song_url_emotions.json"
    create_emotion_embeddings(input_file)