Francesco committed on
Commit
e997328
1 Parent(s): 04242a9

first version with emotions

Browse files
.gitignore ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ disney-lyrics/
2
+ # Byte-compiled / optimized / DLL files
3
+ __pycache__/
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # poetry
99
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103
+ #poetry.lock
104
+
105
+ # pdm
106
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107
+ #pdm.lock
108
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109
+ # in version control.
110
+ # https://pdm.fming.dev/#use-with-ide
111
+ .pdm.toml
112
+
113
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114
+ __pypackages__/
115
+
116
+ # Celery stuff
117
+ celerybeat-schedule
118
+ celerybeat.pid
119
+
120
+ # SageMath parsed files
121
+ *.sage.py
122
+
123
+ # Environments
124
+ .env
125
+ .venv
126
+ env/
127
+ venv/
128
+ ENV/
129
+ env.bak/
130
+ venv.bak/
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ #.idea/
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import streamlit as st
4
+ from dotenv import load_dotenv
5
+ from langchain.chains import LLMChain
6
+ from langchain.prompts import PromptTemplate
7
+
8
+ load_dotenv()
9
+ import os
10
+
11
+ from langchain.chat_models import ChatOpenAI
12
+ from langchain.embeddings.openai import OpenAIEmbeddings
13
+
14
+ from data import load_db
15
+ from names import DATASET_ID, MODEL_ID
16
+
17
@st.cache_resource
def init():
    """Create the vector store and the emotion-suggestion chain.

    The Deep Lake dataset is opened read-only through ``load_db`` and the
    chain is an ``LLMChain`` over the template stored in
    ``prompts/bot.prompt``. The function is wrapped in
    ``st.cache_resource`` so the objects can be reused across reruns.

    Returns:
        A ``(db, chain)`` tuple.
    """
    embedder = OpenAIEmbeddings(model=MODEL_ID)
    org_id = os.environ["ACTIVELOOP_ORG_ID"]

    vector_store = load_db(
        f"hub://{org_id}/{DATASET_ID}",
        embedding_function=embedder,
        token=os.environ["ACTIVELOOP_TOKEN"],
        org_id=os.environ["ACTIVELOOP_ORG_ID"],
        read_only=True,
    )

    # The template exposes a single "content" slot for the user's message.
    template_text = Path("prompts/bot.prompt").read_text()
    bot_prompt = PromptTemplate(input_variables=["content"], template=template_text)

    emotion_chain = LLMChain(llm=ChatOpenAI(temperature=0.7), prompt=bot_prompt)

    return vector_store, emotion_chain
40
+
41
# Shared resources: the lyrics vector store and the emotion-suggestion chain.
db, chain = init()

st.title("Disney song for you")

text_input = st.text_input(
    label="How are you feeling today?",
    placeholder="I am ready to rock and roll!",
)

clicked = st.button("Click me")
# Empty slots that get_emotions() fills once the button is clicked:
# the first shows the suggested emotions, the second the embedded player.
placeholder_emotions = st.empty()
placeholder = st.empty()
53
+
54
def get_emotions(user_input):
    """Suggest emotions for ``user_input`` and render the best-matching song.

    The text is sent through the module-level ``chain`` to obtain a short
    list of emotions, which is then used as the query for a cosine
    similarity search on ``db``. The emotions are written into
    ``placeholder_emotions`` and the top match is embedded as an iframe in
    ``placeholder``.

    Args:
        user_input: Free text typed by the user.
    """
    from html import escape

    emotions = chain.run(content=user_input)
    print(f"Emotions: {emotions}")
    matches = db.similarity_search_with_score(emotions, distance_metric="cos")
    print(matches)

    with placeholder_emotions:
        st.write(emotions)

    # An empty result would make matches[0] raise IndexError; tell the user
    # instead of crashing the app.
    if not matches:
        with placeholder:
            st.write("No matching song found.")
        return

    doc, _score = matches[0]
    # The URL comes from dataset metadata; escape it before placing it in an
    # HTML attribute so a stray quote cannot break out of the markup.
    embed_url = escape(doc.metadata["embed_url"], quote=True)
    iframe_html = f'<iframe src="{embed_url}" style="border:0"> </iframe>'
    with placeholder:
        st.components.v1.html(
            f"<div style='display:flex;flex-direction:column'>{iframe_html}</div>"
        )


if clicked:
    get_emotions(text_input)
data.py CHANGED
@@ -1,66 +1,48 @@
 
1
 
2
- # def get_lyrics_url_from_website():
3
- # # https://www.disneyclips.com/lyrics/
 
4
 
5
- import aiohttp
6
- import asyncio
7
- from bs4 import BeautifulSoup
8
 
9
- from typing import List, TypedDict, Tuple, Optional
10
 
11
- class Lyric(TypedDict):
12
- name: str
13
- text: str
14
 
15
- class Movie(TypedDict):
16
- title: str
17
- lyrics: List[Lyric]
18
 
 
 
19
 
20
- URL = "https://www.disneyclips.com/lyrics/"
 
 
 
 
 
 
 
 
 
21
 
 
22
 
23
- async def get_lyrics_urls_from_movie_url(url: str, session: aiohttp.ClientSession) -> Optional[Tuple[str, str]]:
24
- async with session.get(url) as response:
25
- html = await response.text()
26
- soup = BeautifulSoup(html, 'html.parser')
27
- table = soup.find('table', {'class': 'songs'})
28
- names_and_urls = None
29
- if table:
30
- links = table.find_all('a')
31
- names_and_urls = []
32
- for link in links:
33
- names_and_urls.append((link.text, f"{URL}/{link.get('href')}"))
34
- return names_and_urls
35
 
36
- async def get_lyric_from_lyric_url(url: str, name: str, session: aiohttp.ClientSession) -> Lyric:
37
- async with session.get(url) as response:
38
- html = await response.text()
39
- soup = BeautifulSoup(html, 'html.parser')
40
- div = soup.find('div', {'id': 'cnt'}).find('div', {'class': 'main'})
41
- paragraphs = div.find_all('p')
42
- text = ""
43
- for p in paragraphs:
44
- text += p.text
45
- return text
46
 
47
 
 
 
 
48
 
49
- async def get_movie_names_and_urls(session: aiohttp.ClientSession) -> List[Tuple[str, str]]:
50
- async with session.get(URL) as response:
51
- html = await response.text()
52
- soup = BeautifulSoup(html, 'html.parser')
53
- links = soup.find('div', {'id': 'cnt'}).find('div', {'class': 'main'}).find_all('a')
54
- movie_names_and_urls = [(link.text, f"{URL}/{link.get('href')}") for link in links]
55
- return movie_names_and_urls
56
-
57
 
58
-
59
-
60
- async def main():
61
- async with aiohttp.ClientSession() as session:
62
- names_and_urls = await get_movie_names_and_urls(session)
63
- data = await asyncio.gather(*[asyncio.create_task(get_lyrics_urls_from_movie_url(names, url, session)) for (names, url) in names_and_urls])
64
-
65
- loop = asyncio.get_event_loop()
66
- loop.run_until_complete(main())
 
1
+ from dotenv import load_dotenv
2
 
3
+ load_dotenv()
4
+ import json
5
+ import os
6
 
7
+ from langchain.embeddings.openai import OpenAIEmbeddings
8
+ from langchain.llms import OpenAI
9
+ from langchain.vectorstores import DeepLake
10
 
11
+ from names import DATASET_ID, MODEL_ID
12
 
 
 
 
13
 
14
def create_db(dataset_path: str, json_filepath: str) -> DeepLake:
    """Embed every lyric in a JSON file and store it in a Deep Lake dataset.

    The JSON file maps a movie title to a list of lyric objects, each with
    ``name``, ``text`` and ``embed_url`` keys. The lyric text is what gets
    embedded; the movie, song name and embed URL are stored as metadata.

    Args:
        dataset_path: Destination dataset path passed to ``DeepLake.from_texts``.
        json_filepath: Path of the lyrics JSON file to read.

    Returns:
        The ``DeepLake`` vector store created from the lyrics.
    """
    with open(json_filepath, "r") as f:
        lyrics_by_movie = json.load(f)

    # One (text, metadata) pair per lyric, in file order.
    records = [
        (
            lyric["text"],
            {
                "movie": movie,
                "name": lyric["name"],
                "embed_url": lyric["embed_url"],
            },
        )
        for movie, lyrics in lyrics_by_movie.items()
        for lyric in lyrics
    ]
    texts = [text for text, _ in records]
    metadatas = [metadata for _, metadata in records]

    return DeepLake.from_texts(
        texts,
        OpenAIEmbeddings(model=MODEL_ID),
        metadatas=metadatas,
        dataset_path=dataset_path,
    )
 
 
 
 
 
 
 
 
 
39
 
40
 
41
def load_db(dataset_path: str, *args, **kwargs) -> DeepLake:
    """Open the Deep Lake vector store located at ``dataset_path``.

    Any extra positional and keyword arguments are forwarded unchanged to
    the ``DeepLake`` constructor.
    """
    return DeepLake(dataset_path, *args, **kwargs)
44
 
 
 
 
 
 
 
 
 
45
 
46
if __name__ == "__main__":
    # Build the hub dataset from the Spotify-filtered lyrics file.
    org_id = os.environ["ACTIVELOOP_ORG_ID"]
    create_db(f"hub://{org_id}/{DATASET_ID}", "data/lyrics_with_spotify_url.json")
 
 
 
 
 
 
data/lyrics.json ADDED
The diff for this file is too large to render. See raw diff
 
data/lyrics_with_spotify_url.json ADDED
The diff for this file is too large to render. See raw diff
 
data/lyrics_with_spotify_url_and_summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"friend like me": {"summary": "SUMMARY: The song is about the power of friendship and the willingness to help others, with a focus on the magical abilities of the genie and the endless possibilities that come with his assistance.", "embed_url": "https://open.spotify.com/embed/track/5f2TWu6R2YYCJtLQ0fP78H?utm_source=generator"}, "arabian nights": {"summary": "SUMMARY: The song evokes a sense of adventure and exoticism, with themes of home, heat, and the allure of Arabian nights.", "embed_url": "https://open.spotify.com/embed/track/0CKmN3Wwk8W4zjU0pqq2cv?utm_source=generator"}, "a whole new world": {"summary": "SUMMARY: The song is about the excitement and wonder of discovering a new world with someone you love, and the feeling of limitless possibilities that come with it.", "embed_url": "https://open.spotify.com/embed/track/1hwdPQtFHISvZ9SXMkNrIK?utm_source=generator"}, "one jump ahead": {"summary": "SUMMARY: Aladdin sings about his struggles as a street rat, constantly having to stay one step ahead of the law and society's expectations, while relying on his friendship with Abu to survive.", "embed_url": "https://open.spotify.com/embed/track/4wN8Ov3kPZdkJ8XcYxYUGz?utm_source=generator"}}
data/spotify_disney_songs.json ADDED
The diff for this file is too large to render. See raw diff
 
embeddings.npy ADDED
Binary file (24.7 kB). View file
 
names.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ MODEL_ID = "text-embedding-ada-002"
2
+ DATASET_ID = "disney-lyrics"
playground.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+
3
+ load_dotenv()
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+
8
+ import deeplake
9
+ import numpy as np
10
+ import openai
11
+
12
+ # https://www.disneyclips.com/lyrics/
13
# Scratch script: embed a search phrase and look up the closest lyric
# embeddings in the Deep Lake dataset with a TQL query.
DATASET_NAME = "disney-lyrics"
model_id = "text-embedding-ada-002"
dataset_path = f"hub://{os.environ['ACTIVELOOP_ORG_ID']}/{DATASET_NAME}"
print(dataset_path)
runtime = {"db_engine": True}

with open("lyrics.json", "rb") as f:
    lyrics = json.load(f)["lyrics"]

# One-off step that produced embeddings.npy; kept for reference.
# embeddings = [el["embedding"] for el in openai.Embedding.create(input=lyrics, model=model_id)['data']]

# embeddings_np = np.array(embeddings)
# np.save("embeddings.npy", embeddings_np)

embeddings_np = np.load("embeddings.npy")

print(embeddings_np.shape)


# One-off step that created the dataset; kept for reference.
# ds = deeplake.empty(dataset_path, runtime=runtime, overwrite=True)

# # https://docs.deeplake.ai/en/latest/Htypes.html
# with ds:
#     ds.create_tensor("embedding", htype="embedding", dtype=np.float32, exist_ok=True)
#     ds.extend({ "embedding": embeddings_np.astype(np.float32)})
#     ds.summary()

search_term = "Let's get down to business"

# Use the same model as the stored embeddings so the vectors are comparable.
embedding = openai.Embedding.create(input=search_term, model=model_id)["data"][0][
    "embedding"
]

# Format the embedding as a string, so it can be passed in the REST API request.
embedding_search = ",".join([str(item) for item in embedding])

# embedding_search = ",".join([str(item) for item in embeddings_np[0].tolist()])
# print(embedding_search)


ds = deeplake.load(dataset_path)

# print(embedding_search)
# l2_norm is a distance, so the best matches have the SMALLEST score:
# sort ascending (descending would return the five least similar rows).
query = f'select * from (select l2_norm(embedding - ARRAY[{embedding_search}]) as score from "{dataset_path}") order by score asc limit 5'
# Dump the (very long) query to a file so it can be inspected.
with open("foo.txt", "w") as f:
    f.write(query)
query_res = ds.query(query)
print(query_res)
prompts/bot.prompt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ We have a simple song retrieval system. It accepts a max of 4 emotions. You are tasked to suggest emotions to match the user's feelings. Let me show you a couple of examples:
2
+
3
+ Input: "I had a great day!"
4
+ Output: "Joy and Energy"
5
+ Input: "I am very tired today and I am not feeling well"
6
+ Output: "Exhaustion, Discomfort, and Fatigue"
7
+
8
+ If the sentence is too short, you can also suggest just one or two emotions.
9
+ Please, suggest emotions for input = "{content}", reply ONLY with a max of FOUR emotions.
prompts/bot_with_summary.prompt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Given the following list of songs:
2
+
3
+ {songs}
4
+
5
+ Given a user input, output the song name, ONLY THE SONG NAME, that is appropriate for the user's feelings/emotions.
6
+
7
+ For example:
8
+ Input: "Today I am not feeling great"
9
+ <SONG_NAME>
10
+
11
+ The user input is "{user_input}", reply only with the song name
prompts/summary.prompt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ This is a disney song, can you output me a string with a one sentence summary of the themes/emotions of the song? Be specific, we will use the emotions/themes as keywords to search later. JUST the summary, not an introduction.
2
+
3
+ examples
4
+ INPUT: <SONG>
5
+ OUTPUT: <SUMMARY>
6
+
7
+ {song}
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ torch==2.0.1
3
+ torchvision
4
+ python-dotenv
5
+ deeplake
6
+ langchain
7
+ tiktoken
8
+ aiohttp
9
+ cchardet
10
+ aiodns
11
+ streamlit
scrape.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # def get_lyrics_url_from_website():
2
+ # # https://www.disneyclips.com/lyrics/
3
+
4
+ import asyncio
5
+ import json
6
+ from collections import defaultdict
7
+ from itertools import chain
8
+ from typing import List, Optional, Tuple, TypedDict
9
+
10
+ import aiohttp
11
+ from bs4 import BeautifulSoup
12
+
13
+ URL = "https://www.disneyclips.com/lyrics/"
14
+
15
+
16
async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Collect the song links listed on one movie page.

    Args:
        movie_name: Title of the movie; copied into every returned tuple.
        url: Address of the movie page to download.
        session: Open aiohttp session used for the request.

    Returns:
        A list of ``(movie_name, song_name, song_url)`` tuples, one per link
        found in the page's ``<table class="songs">``. The list is empty when
        the page has no such table.
    """
    from urllib.parse import urljoin

    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "songs"})
    if table is None:
        return []

    # Resolve each href against the page it came from, the way a browser
    # would; plain string concatenation produced ".../lyrics//<href>".
    # Anchors without an href are skipped instead of becoming ".../None".
    return [
        (movie_name, link.text, urljoin(url, link["href"]))
        for link in table.find_all("a", href=True)
    ]
32
+
33
+
34
async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Download one lyric page and extract the lyric text.

    Args:
        movie_name: Title of the movie; passed through to the result.
        lyric_name: Name of the song; passed through to the result.
        url: Address of the lyric page to download.
        session: Open aiohttp session used for the request.

    Returns:
        A ``(movie_name, lyric_name, text)`` tuple.
    """
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})
    # first <p> has the lyric
    p = div.find_all("p")[0]
    # Turn line breaks into sentence separators so lines do not run together.
    for br in p.find_all("br"):
        br.replace_with(". ")
    # Drop <span> elements so their text is not mixed into the lyric.
    for span in p.find_all("span"):
        span.decompose()

    return (movie_name, lyric_name, p.text)
52
+
53
+
54
async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """List every movie linked from the lyrics index page at ``URL``.

    Args:
        session: Open aiohttp session used for the request.

    Returns:
        A list of ``(movie_name, movie_url)`` tuples, one per link found in
        the main content ``<div>`` of the index page.
    """
    from urllib.parse import urljoin

    async with session.get(URL) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})
    # URL already ends with "/", so joining with another "/" produced
    # ".../lyrics//<href>"; urljoin resolves the href like a browser does.
    # Anchors without an href are skipped.
    return [
        (link.text, urljoin(URL, link["href"]))
        for link in main.find_all("a", href=True)
    ]
67
+
68
+
69
async def scrape_disney_lyrics():
    """Scrape all lyrics and write them to ``data/lyrics.json``.

    Three stages run inside a single HTTP session: fetch the movie index,
    fetch every movie page concurrently to find its songs, then fetch every
    song page concurrently. The lyrics are grouped by movie title and dumped
    as ``{movie: [{"name": ..., "text": ...}, ...]}``.
    """
    async with aiohttp.ClientSession() as session:
        movies = await get_movie_names_and_urls(session)

        movie_tasks = [
            asyncio.create_task(
                get_lyrics_names_and_urls_from_movie_url(name, url, session)
            )
            for name, url in movies
        ]
        songs_per_movie = await asyncio.gather(*movie_tasks)

        # Flatten the per-movie lists into one stream of song descriptors.
        lyric_tasks = [
            asyncio.create_task(get_lyric_from_lyric_url(*song, session))
            for song in chain.from_iterable(songs_per_movie)
        ]
        lyrics = await asyncio.gather(*lyric_tasks)

    lyrics_by_movie = defaultdict(list)
    for movie_name, lyric_name, lyric_text in lyrics:
        lyrics_by_movie[movie_name].append({"name": lyric_name, "text": lyric_text})

    with open("data/lyrics.json", "w") as f:
        json.dump(lyrics_by_movie, f)
94
+
95
+
96
# asyncio.run creates, runs and closes the event loop; calling
# asyncio.get_event_loop() with no running loop is deprecated.
asyncio.run(scrape_disney_lyrics())
scripts/create_one_sentence_summary.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ from langchain.chains import LLMChain
5
+ from langchain.prompts import PromptTemplate
6
+ from pathlib import Path
7
+ from langchain.chat_models import ChatOpenAI
8
+ import json
9
+ from collections import defaultdict
10
+ from pprint import pprint
11
+
12
# Data files are addressed relative to the repository root, like the prompt
# path below, so the script is not tied to one developer's home directory.
LYRICS_PATH = Path("data/lyrics_with_spotify_url.json")
SUMMARIES_PATH = Path("data/lyrics_with_spotify_url_and_summary.json")

prompt = PromptTemplate(
    input_variables=["song"],
    template=Path("prompts/summary.prompt").read_text(),
)

# temperature=0 is passed so summaries vary as little as possible between runs.
llm = ChatOpenAI(temperature=0)

chain = LLMChain(llm=llm, prompt=prompt)

with open(LYRICS_PATH, "r") as f:
    data = json.load(f)

# Maps lower-cased song name -> {"summary": ..., "embed_url": ...}.
lyrics_summaries = {}

for lyrics in data.values():
    for lyric in lyrics:
        print(f"Creating summary for {lyric['name']}")
        summary = chain.run(song=lyric["text"])
        lyrics_summaries[lyric["name"].lower()] = {
            "summary": summary,
            "embed_url": lyric["embed_url"],
        }
    # NOTE(review): stops after the first movie. This looks like a leftover
    # limit to keep LLM calls cheap while experimenting; remove the break to
    # summarize every movie.
    break

with open(SUMMARIES_PATH, "w") as f:
    json.dump(lyrics_summaries, f)

pprint(lyrics_summaries)
scripts/keep_only_lyrics_on_spotify.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This script will keep only the lyrics that are in the Spotify "Disney Hits" playlist
2
+ """
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ import json
7
+
8
+ import spotipy
9
+ from spotipy.oauth2 import SpotifyClientCredentials
10
+
11
from collections import defaultdict

# Spotify's "Disney Hits" playlist.
uri = "spotify:playlist:37i9dQZF1DX8C9xQcOrE6T"

spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())
playlist = spotify.playlist(uri)

# with open("spotify_disney_songs.json", "w") as f:
#     json.dump(playlist,f)

# The playlist payload carries its tracks as a paged result; follow the
# "next" links so tracks beyond the first page are not silently dropped.
page = playlist["tracks"]
playlist_items = list(page["items"])
while page["next"]:
    page = spotify.next(page)
    playlist_items.extend(page["items"])

with open("data/lyrics.json", "r") as f:
    data = json.load(f)

# Maps normalized track name -> {"id": ..., "embed_url": ...}.
spotify_tracks = {}

for item in playlist_items:
    track = item["track"]
    # NOTE(review): entries for removed/unavailable tracks presumably carry
    # a null track; skip them rather than crash. Confirm against the API.
    if track is None:
        continue
    # Keep only the title part before any "-" suffix, lower-cased, so it can
    # be compared with the scraped lyric names.
    track_name = track["name"].lower().split("-")[0].strip()
    print(track_name)
    spotify_tracks[track_name] = {
        "id": track["id"],
        "embed_url": f"https://open.spotify.com/embed/track/{track['id']}?utm_source=generator",
    }

# here we add only songs that are in the Disney spotify playlist
data_filtered = defaultdict(list)
tot = 0
for movie, lyrics in data.items():
    for lyric in lyrics:
        lyric_name = lyric["name"].lower()
        if lyric_name in spotify_tracks:
            data_filtered[movie].append(
                {**lyric, "embed_url": spotify_tracks[lyric_name]["embed_url"]}
            )
            tot += 1
# Number of lyrics that were matched to a playlist track.
print(tot)

with open("data/lyrics_with_spotify_url.json", "w") as f:
    json.dump(data_filtered, f)
temp.ipynb ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "26b62e0c",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%load_ext autoreload\n",
11
+ "%autoreload "
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "b1a6a020",
18
+ "metadata": {
19
+ "scrolled": true
20
+ },
21
+ "outputs": [
22
+ {
23
+ "name": "stderr",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "/home/zuppif/miniconda3/envs/activeloop/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.4.3) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
27
+ " warnings.warn(\n",
28
+ "-"
29
+ ]
30
+ },
31
+ {
32
+ "name": "stdout",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/zuppif/disney-lyrics\n",
36
+ "\n"
37
+ ]
38
+ },
39
+ {
40
+ "name": "stderr",
41
+ "output_type": "stream",
42
+ "text": [
43
+ "|"
44
+ ]
45
+ },
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "hub://zuppif/disney-lyrics loaded successfully.\n",
51
+ "\n",
52
+ "Deep Lake Dataset in hub://zuppif/disney-lyrics already exists, loading from the storage\n",
53
+ "Dataset(path='hub://zuppif/disney-lyrics', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
54
+ "\n",
55
+ " tensor htype shape dtype compression\n",
56
+ " ------- ------- ------- ------- ------- \n",
57
+ " embedding generic (85, 1536) float32 None \n",
58
+ " ids text (85, 1) str None \n",
59
+ " metadata json (85, 1) str None \n",
60
+ " text text (85, 1) str None \n"
61
+ ]
62
+ },
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "\r",
68
+ " \r",
69
+ "\r",
70
+ " \r"
71
+ ]
72
+ }
73
+ ],
74
+ "source": [
75
+ "from dotenv import load_dotenv\n",
76
+ "load_dotenv() \n",
77
+ "from names import DATASET_ID, MODEL_ID\n",
78
+ "from data import load_db\n",
79
+ "import os\n",
80
+ "from langchain.chains import RetrievalQA, ConversationalRetrievalChain\n",
81
+ "from langchain.vectorstores import DeepLake\n",
82
+ "from langchain.llms import OpenAI\n",
83
+ "from langchain.embeddings.openai import OpenAIEmbeddings\n",
84
+ "from langchain.chat_models import ChatOpenAI\n",
85
+ "\n",
86
+ "embeddings = OpenAIEmbeddings(model=MODEL_ID)\n",
87
+ "dataset_path = f\"hub://{os.environ['ACTIVELOOP_ORG_ID']}/{DATASET_ID}\"\n",
88
+ "\n",
89
+ "db = load_db(dataset_path, embedding_function=embeddings, token=os.environ['ACTIVELOOP_TOKEN'], org_id=os.environ[\"ACTIVELOOP_ORG_ID\"], read_only=True)"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 80,
95
+ "id": "07d8a381",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "from langchain.chains import LLMChain\n",
100
+ "from langchain.prompts import PromptTemplate\n",
101
+ "from pathlib import Path\n",
102
+ "\n",
103
+ "prompt = PromptTemplate(\n",
104
+ " input_variables=[\"content\"],\n",
105
+ " template=Path(\"prompts/bot.prompt\").read_text(),\n",
106
+ ")\n",
107
+ "\n",
108
+ "llm = ChatOpenAI(temperature=0.7)\n",
109
+ "\n",
110
+ "chain = LLMChain(llm=llm, prompt=prompt)"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 81,
116
+ "id": "ebca722d",
117
+ "metadata": {},
118
+ "outputs": [
119
+ {
120
+ "data": {
121
+ "text/plain": [
122
+ "'Melancholy, Coziness, Nostalgia, Calmness.'"
123
+ ]
124
+ },
125
+ "execution_count": 81,
126
+ "metadata": {},
127
+ "output_type": "execute_result"
128
+ }
129
+ ],
130
+ "source": [
131
+ "emotions = chain.run(content=\"It's rainy\")\n",
132
+ "emotions"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 84,
138
+ "id": "9598a36c",
139
+ "metadata": {
140
+ "scrolled": false
141
+ },
142
+ "outputs": [
143
+ {
144
+ "data": {
145
+ "text/plain": [
146
+ "'https://open.spotify.com/embed/track/5EeQQ8BVJTRkp1AIKJILGY?utm_source=generator'"
147
+ ]
148
+ },
149
+ "execution_count": 84,
150
+ "metadata": {},
151
+ "output_type": "execute_result"
152
+ }
153
+ ],
154
+ "source": [
155
+ "doc, score = db.similarity_search_with_score(emotions, distance_metric=\"cos\")[0]\n",
156
+ "doc.metadata[\"embed_url\"]"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 83,
162
+ "id": "d6214e40",
163
+ "metadata": {},
164
+ "outputs": [
165
+ {
166
+ "data": {
167
+ "text/html": [
168
+ "\n",
169
+ " <iframe\n",
170
+ " width=\"700\"\n",
171
+ " height=\"350\"\n",
172
+ " src=\"https://open.spotify.com/embed/track/5EeQQ8BVJTRkp1AIKJILGY?utm_source=generator\"\n",
173
+ " frameborder=\"0\"\n",
174
+ " allowfullscreen\n",
175
+ " \n",
176
+ " ></iframe>\n",
177
+ " "
178
+ ],
179
+ "text/plain": [
180
+ "<IPython.lib.display.IFrame at 0x7fb0be920a00>"
181
+ ]
182
+ },
183
+ "execution_count": 83,
184
+ "metadata": {},
185
+ "output_type": "execute_result"
186
+ }
187
+ ],
188
+ "source": [
189
+ "doc.metadata[\"embed_url\"]\n",
190
+ "\n",
191
+ "from IPython.display import IFrame\n",
192
+ "IFrame(doc.metadata[\"embed_url\"], width=700, height=350)"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 4,
198
+ "id": "28ae2c63",
199
+ "metadata": {
200
+ "scrolled": true
201
+ },
202
+ "outputs": [
203
+ {
204
+ "data": {
205
+ "text/plain": [
206
+ "Dataset(path='hub://zuppif/disney-lyrics', read_only=True, index=Index([()]), tensors=['embedding', 'ids', 'metadata', 'text'])"
207
+ ]
208
+ },
209
+ "execution_count": 4,
210
+ "metadata": {},
211
+ "output_type": "execute_result"
212
+ }
213
+ ],
214
+ "source": [
215
+ "db.ds.query(\"select * where contains(\\\"text\\\", 'Did they') limit 2\")"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": null,
221
+ "id": "1780552c",
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": []
225
+ }
226
+ ],
227
+ "metadata": {
228
+ "kernelspec": {
229
+ "display_name": "Python 3 (ipykernel)",
230
+ "language": "python",
231
+ "name": "python3"
232
+ },
233
+ "language_info": {
234
+ "codemirror_mode": {
235
+ "name": "ipython",
236
+ "version": 3
237
+ },
238
+ "file_extension": ".py",
239
+ "mimetype": "text/x-python",
240
+ "name": "python",
241
+ "nbconvert_exporter": "python",
242
+ "pygments_lexer": "ipython3",
243
+ "version": "3.9.16"
244
+ }
245
+ },
246
+ "nbformat": 4,
247
+ "nbformat_minor": 5
248
+ }