MorenoLaQuatra
committed on
Commit
·
1f0b3af
1
Parent(s):
c0ec6ae
initial commit
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +0 -0
- app.py +442 -0
- full_metadata.csv +3 -0
- sample_mp3/000153.mp3 +3 -0
- sample_mp3/000206.mp3 +3 -0
- sample_mp3/000787.mp3 +3 -0
- sample_mp3/000913.mp3 +3 -0
- sample_mp3/001123.mp3 +3 -0
- sample_mp3/001280.mp3 +3 -0
- sample_mp3/001360.mp3 +3 -0
- sample_mp3/001550.mp3 +3 -0
- sample_mp3/001662.mp3 +3 -0
- sample_mp3/001713.mp3 +3 -0
- sample_mp3/001718.mp3 +3 -0
- sample_mp3/001732.mp3 +3 -0
- sample_mp3/001809.mp3 +3 -0
- sample_mp3/001935.mp3 +3 -0
- sample_mp3/003258.mp3 +3 -0
- sample_mp3/003357.mp3 +3 -0
- sample_mp3/003459.mp3 +3 -0
- sample_mp3/003505.mp3 +3 -0
- sample_mp3/003685.mp3 +3 -0
- sample_mp3/003911.mp3 +3 -0
- sample_mp3/003981.mp3 +3 -0
- sample_mp3/004183.mp3 +3 -0
- sample_mp3/004315.mp3 +3 -0
- sample_mp3/004539.mp3 +3 -0
- sample_mp3/004753.mp3 +3 -0
- sample_mp3/004909.mp3 +3 -0
- sample_mp3/005220.mp3 +3 -0
- sample_mp3/005346.mp3 +3 -0
- sample_mp3/005619.mp3 +3 -0
- sample_mp3/006464.mp3 +3 -0
- sample_mp3/007429.mp3 +3 -0
- sample_mp3/007537.mp3 +3 -0
- sample_mp3/007552.mp3 +3 -0
- sample_mp3/007559.mp3 +3 -0
- sample_mp3/007937.mp3 +3 -0
- sample_mp3/008140.mp3 +3 -0
- sample_mp3/008168.mp3 +3 -0
- sample_mp3/008370.mp3 +3 -0
- sample_mp3/008971.mp3 +3 -0
- sample_mp3/009210.mp3 +3 -0
- sample_mp3/009804.mp3 +3 -0
- sample_mp3/009955.mp3 +3 -0
- sample_mp3/010035.mp3 +3 -0
- sample_mp3/010109.mp3 +3 -0
- sample_mp3/010212.mp3 +3 -0
- sample_mp3/010259.mp3 +3 -0
- sample_mp3/010713.mp3 +3 -0
.gitattributes
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
from st_btn_select import st_btn_select
|
5 |
+
from streamlit_option_menu import option_menu
|
6 |
+
|
7 |
+
from cgi import test
|
8 |
+
import streamlit as st
|
9 |
+
import pandas as pd
|
10 |
+
from PIL import Image
|
11 |
+
import os
|
12 |
+
import glob
|
13 |
+
|
14 |
+
from transformers import CLIPVisionModel, AutoTokenizer, AutoModel
|
15 |
+
from transformers import ViTFeatureExtractor, ViTForImageClassification
|
16 |
+
|
17 |
+
import torch
|
18 |
+
from tqdm import tqdm
|
19 |
+
from PIL import Image
|
20 |
+
import numpy as np
|
21 |
+
from torch.utils.data import DataLoader
|
22 |
+
from transformers import default_data_collator
|
23 |
+
|
24 |
+
from torch.utils.data import Dataset, DataLoader
|
25 |
+
import torchvision.transforms as transforms
|
26 |
+
|
27 |
+
from bokeh.models.widgets import Button
|
28 |
+
from bokeh.models import CustomJS
|
29 |
+
from streamlit_bokeh_events import streamlit_bokeh_events
|
30 |
+
|
31 |
+
from webcam import webcam
|
32 |
+
|
33 |
+
## Global Variables
|
34 |
+
# Root folder of the audio clips. The repository ships them as
# sample_mp3/<6-digit track id>.mp3, so the folder name is singular
# (the previous value "samples_mp3/" pointed at a folder that does not exist).
MP3_ROOT_PATH = "sample_mp3/"
# Folder scanned (via glob "*.jpeg") for the spectrogram images to embed.
SPECTROGRAMS_PATH = "sample_spectrograms/"

# Side length, in pixels, that VisionDataset resizes every image to.
IMAGE_SIZE = 224
# Per-channel mean / std handed to transforms.Normalize in VisionDataset.
MEAN = torch.tensor([0.48145466, 0.4578275, 0.40821073])
STD = torch.tensor([0.26862954, 0.26130258, 0.27577711])

# Hub identifier of the tokenizer paired with the text encoder.
TEXT_MODEL = 'bert-base-uncased'

# Local folders holding the text / vision encoder checkpoints
# (loaded with local_files_only=True by the page functions).
CLIP_TEXT_MODEL_PATH = "text_model/"
CLIP_VISION_MODEL_PATH = "vision_model/"
|
45 |
+
|
46 |
+
## NavBar
|
47 |
+
def streamlit_menu(example=1):
    """Render the navigation menu and return the selected entry.

    Args:
        example: Layout variant. 1 draws the menu inside the sidebar,
            2 draws a plain horizontal menu, 3 draws a horizontal menu
            with custom styling. Any other value draws nothing.

    Returns:
        The value returned by ``option_menu`` for the chosen layout, or
        ``None`` when ``example`` is not 1, 2 or 3.
    """
    # Settings shared by every layout variant.
    common = dict(
        options=["Text", "Audio", "Camera"],  # required
        icons=["chat-text", "mic", "camera"],  # optional
        menu_icon="cast",  # optional
        default_index=0,  # optional
    )

    if example == 1:
        # Variant 1: menu rendered in the sidebar.
        with st.sidebar:
            choice = option_menu(menu_title="Main Menu", **common)
        return choice

    if example == 2:
        # Variant 2: horizontal menu without custom style.
        return option_menu(menu_title=None, orientation="horizontal", **common)

    if example == 3:
        # Variant 3: horizontal menu with custom style.
        custom_styles = {
            "container": {"padding": "0!important", "background-color": "#fafafa"},
            "icon": {"color": "#ffde59", "font-size": "25px"},
            "nav-link": {
                "font-size": "25px",
                "text-align": "left",
                "margin": "0px",
                "--hover-color": "#eee",
            },
            "nav-link-selected": {"background-color": "#5271ff"},
        }
        return option_menu(
            menu_title=None,
            orientation="horizontal",
            styles=custom_styles,
            **common,
        )

    return None
|
94 |
+
|
95 |
+
|
96 |
+
## Draw Sidebar
|
97 |
+
def draw_sidebar(
    key,
    plot=False,
):
    """Draw the demo sidebar widgets: a markdown banner, a slider and a button selector.

    Args:
        key: Streamlit widget key given to the slider.
        plot: Not used by the function body.
    """
    banner_markdown = """
        # Sidebar

        ```python
        Think.
        Search.
        Feel.
        ```
        """
    st.write(banner_markdown)

    st.slider("From 1 to 10, how cool is this app?", min_value=1, max_value=10, key=key)

    # Button-style selector; the third entry is preselected.
    choice = st_btn_select(('option1', 'option2', 'option3'), index=2)
    st.write(f'Selected option: {choice}')
|
118 |
+
|
119 |
+
## Change Color
|
120 |
+
#def change_color(styles="")
|
121 |
+
|
122 |
+
## VisionDataset
|
123 |
+
class VisionDataset(Dataset):
    """Dataset that loads images from file paths and returns preprocessed tensors."""

    # Shared pipeline: resize to IMAGE_SIZE x IMAGE_SIZE, convert to a
    # tensor, then normalize with the module-level MEAN / STD.
    preprocess = transforms.Compose([
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=MEAN, std=STD)
    ])

    def __init__(self, image_paths: list):
        """Store the list of image file paths to serve."""
        self.image_paths = image_paths

    def __len__(self):
        """Return the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Open the image at position ``idx``, force RGB and preprocess it."""
        path = self.image_paths[idx]
        rgb_image = Image.open(path).convert('RGB')
        return self.preprocess(rgb_image)
|
138 |
+
|
139 |
+
## TextDataset
|
140 |
+
class TextDataset(Dataset):
    """Dataset of pre-tokenized sentences, padded / truncated to a fixed length."""

    def __init__(self, text: list, tokenizer, max_len):
        """Tokenize every sentence up front.

        Args:
            text: List of sentences.
            tokenizer: Callable invoked as
                ``tokenizer(text, padding='max_length', max_length=max_len,
                truncation=True)``; its result must expose ``'input_ids'``
                and ``'attention_mask'`` columns with one row per sentence.
            max_len: Length every sentence is padded / truncated to.
        """
        self.len = len(text)
        self.tokens = tokenizer(text, padding='max_length',
                                max_length=max_len, truncation=True)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` pair of sentence ``idx``."""
        # Read the columns by name instead of indexing the encoding by
        # integer: integer indexing is only available on "fast" tokenizers,
        # while column access works for fast and slow tokenizers alike.
        return {
            'input_ids': self.tokens['input_ids'][idx],
            'attention_mask': self.tokens['attention_mask'][idx],
        }

    def __len__(self):
        """Return the number of sentences."""
        return self.len
|
152 |
+
|
153 |
+
## CLIP Demo
|
154 |
+
class CLIPDemo:
    """Text-to-track search built on a paired text encoder and vision encoder.

    Spectrogram images are embedded once with the vision encoder; a text
    query is embedded with the text encoder and the images are ranked by
    cosine similarity against it.
    """

    def __init__(self, vision_encoder, text_encoder, tokenizer,
                 batch_size: int = 64, max_len: int = 64, device='cuda'):
        """Initialize the demo.

        Args:
            vision_encoder: Vision encoder; must return an object with a
                ``pooler_output`` attribute when called with ``pixel_values``.
            text_encoder: Text encoder; must return an object with a
                ``pooler_output`` attribute when called with ``input_ids``
                and ``attention_mask``.
            tokenizer: Tokenizer matching the text encoder.
            batch_size (int): Mini-batch size used when computing embeddings.
            max_len (int): Tokenizer max length used by
                ``compute_text_embeddings``.
            device: Device the encoders are moved to. A CUDA device is
                replaced by ``'cpu'`` when CUDA is not available.

        Example:
            >>> demo = CLIPDemo(vision_encoder, text_encoder, tokenizer)
            >>> demo.compute_image_embeddings(image_paths)
            >>> demo.image_search('a calm piano song')
        """
        # The default asks for CUDA; fall back to CPU instead of crashing
        # on hosts that have no GPU.
        if str(device).startswith('cuda') and not torch.cuda.is_available():
            device = 'cpu'
        self.vision_encoder = vision_encoder.eval().to(device)
        self.text_encoder = text_encoder.eval().to(device)
        self.batch_size = batch_size
        self.device = device
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_embeddings_ = None
        self.image_embeddings_ = None

    def compute_image_embeddings(self, image_paths: list):
        """Embed every image and store the result in ``self.image_embeddings_``.

        A Streamlit progress bar is shown while the batches are processed.

        Args:
            image_paths: Paths of the images to embed; also stored in
                ``self.image_paths`` for later lookup by ``image_search``.

        Raises:
            ValueError: If ``image_paths`` is empty.
        """
        if len(image_paths) == 0:
            # torch.cat on an empty list fails with an opaque error; say
            # what is actually wrong instead.
            raise ValueError("image_paths is empty: there are no images to embed")
        self.image_paths = image_paths
        dataloader = DataLoader(VisionDataset(
            image_paths=image_paths), batch_size=self.batch_size, num_workers=8)
        n_batches = len(dataloader)
        embeddings = []
        bar = st.progress(0)
        try:
            with torch.no_grad():
                for i, images in enumerate(tqdm(dataloader, desc='computing image embeddings')):
                    bar.progress(int(i / n_batches * 100))
                    image_embedding = self.vision_encoder(
                        pixel_values=images.to(self.device)).pooler_output
                    embeddings.append(image_embedding)
        finally:
            # Remove the progress bar even when a batch fails.
            bar.empty()
        self.image_embeddings_ = torch.cat(embeddings)

    def compute_text_embeddings(self, text: list):
        """Embed every sentence and store the result in ``self.text_embeddings_``.

        Args:
            text: Sentences to embed; also stored in ``self.text``.

        Raises:
            ValueError: If ``text`` is empty.
        """
        if len(text) == 0:
            raise ValueError("text is empty: there are no sentences to embed")
        self.text = text
        dataloader = DataLoader(TextDataset(text=text, tokenizer=self.tokenizer, max_len=self.max_len),
                                batch_size=self.batch_size, collate_fn=default_data_collator)
        embeddings = []
        with torch.no_grad():
            for tokens in tqdm(dataloader, desc='computing text embeddings'):
                text_embedding = self.text_encoder(input_ids=tokens["input_ids"].to(self.device),
                                                   attention_mask=tokens["attention_mask"].to(self.device)).pooler_output
                embeddings.append(text_embedding)
        self.text_embeddings_ = torch.cat(embeddings)

    def text_query_embedding(self, query: str = 'A happy song'):
        """Return the text encoder's pooled embedding of a single query string."""
        tokens = self.tokenizer(query, return_tensors='pt')
        with torch.no_grad():
            text_embedding = self.text_encoder(input_ids=tokens["input_ids"].to(self.device),
                                               attention_mask=tokens["attention_mask"].to(self.device)).pooler_output
        return text_embedding

    def most_similars(self, embeddings_1, embeddings_2):
        """Rank by cosine similarity between the two embedding tensors.

        Returns:
            ``(values, indices)`` on the CPU: the similarities sorted in
            descending order and the positions they came from.
        """
        values, indices = torch.cosine_similarity(
            embeddings_1, embeddings_2).sort(descending=True)
        return values.cpu(), indices.cpu()

    @staticmethod
    def _audio_path(track_id: int) -> str:
        """Return the extension-less audio path for a track id."""
        name = f"{track_id:06d}"
        # Historical layout: <root>/<first 3 digits>/<6-digit id>.
        nested = os.path.join(MP3_ROOT_PATH, name[:3], name)
        # Flat layout: <root>/<6-digit id>.
        flat = os.path.join(MP3_ROOT_PATH, name)
        # Keep the nested path unless only the flat file is actually there.
        if not os.path.exists(nested + ".mp3") and os.path.exists(flat + ".mp3"):
            return flat
        return nested

    def image_search(self, query: str, top_k=10):
        """Return audio paths of the tracks whose spectrograms best match a text query.

        ``compute_image_embeddings`` must have been called first.

        Args:
            query (str): text query
            top_k (int): number of tracks to return

        Returns:
            List of audio paths without the ``.mp3`` extension, best match
            first. Each path ends with the zero-padded 6-digit track id,
            parsed from the matching image file name.
        """
        query_embedding = self.text_query_embedding(query=query)
        _, indices = self.most_similars(self.image_embeddings_, query_embedding)

        songs_path = []
        for position in indices[:top_k].tolist():
            image_name = os.path.basename(self.image_paths[position])
            # Image files are named "<track id>.jpeg".
            track_id = int(os.path.splitext(image_name)[0])
            songs_path.append(self._audio_path(track_id))
        return songs_path
|
247 |
+
|
248 |
+
## Draw text page
|
249 |
+
def draw_text(
    key,
    plot=False,
):
    """Draw the "Text" page: search by dropdown selections or by a free-text description.

    Args:
        key: Not used by the function body.
        plot: Not used by the function body.
    """
    _draw_logo()
    # NOTE(review): this page indexes at most 1000 spectrograms while the
    # other pages index up to 5000; since the model is cached per session,
    # the first page visited decides the corpus size. Confirm the intent.
    model = _ensure_search_model(max_images=1000)

    # Bare string literals; presumably rendered as blank spacer lines by
    # Streamlit "magic" - TODO confirm.
    ""
    ""

    moods = ['-', 'angry', 'calm', 'happy', 'sad']
    genres = ['-', 'house', 'pop', 'rock', 'techno']
    artists = ['-', 'bad dad', 'lazy magnet', 'the astronauts', 'yan yalego']
    years = ['-', '80s', '90s', '2000s', '2010s']

    col1, col2 = st.columns(2)
    mood = col1.selectbox('Which mood do you feel right now?', moods, help="Select a mood here")
    genre = col2.selectbox('Which genre do you want to listen?', genres, help="Select a genre here")
    artist = col1.selectbox('Which artist do you like best?', artists, help="Select an artist here")
    year = col2.selectbox('Which period do you want to relive?', years, help="Select a period here")
    button_form = st.button('Search', key="button_form")

    st.text_input("Otherwise, describe the song you are looking for!", value="", key="sentence")
    button_sentence = st.button('Search', key="button_sentence")

    # The free-text search wins over the form when both buttons report a click.
    sentence = ""
    if button_sentence:
        sentence = st.session_state.sentence
    elif button_form:
        # "-" means "no preference"; join only the chosen values so the
        # query never starts with a stray space.
        sentence = " ".join(
            choice for choice in (mood, genre, artist, year) if choice != "-"
        )

    # An empty query (nothing typed / nothing selected) triggers no search.
    if sentence != "":
        _show_songs(model.image_search(sentence))

## Draw audio page
def draw_audio(
    key,
    plot=False,
):
    """Draw the "Audio" page: dictate a description via browser speech recognition.

    Args:
        key: Not used by the function body.
        plot: Not used by the function body.
    """
    _draw_logo()
    model = _ensure_search_model(max_images=5000)

    # Bare string literals; presumably rendered as blank spacer lines by
    # Streamlit "magic" - TODO confirm.
    ""
    ""

    st.write("Please, describe the kind of song you are looking for!")
    stt_button = Button(label="Start Recording", margin=[5,5,5,200], width=200, default_size=10, width_policy='auto', button_type='primary')

    # On click, run speech recognition in the browser and dispatch the
    # final transcript as a "GET_TEXT" DOM event.
    stt_button.js_on_event("button_click", CustomJS(code="""
        var recognition = new webkitSpeechRecognition();
        recognition.continuous = false;
        recognition.interimResults = true;

        recognition.onresult = function (e) {
            var value = "";
            for (var i = e.resultIndex; i < e.results.length; ++i) {
                if (e.results[i].isFinal) {
                    value += e.results[i][0].transcript;
                }
            }
            if ( value != "") {
                document.dispatchEvent(new CustomEvent("GET_TEXT", {detail: value}));
            }
        }
        recognition.start();
        """))

    result = streamlit_bokeh_events(
        stt_button,
        events="GET_TEXT",
        key="listen",
        refresh_on_update=False,
        override_height=75,
        debounce_time=0)

    if result and "GET_TEXT" in result:
        sentence = result.get("GET_TEXT")
        st.write('You asked for: "' + sentence + '"')
        _show_songs(model.image_search(sentence))

## Draw camera page
def draw_camera(
    key,
    plot=False,
):
    """Draw the "Camera" page: classify a webcam capture into an emotion and search with it.

    Args:
        key: Not used by the function body.
        plot: Not used by the function body.
    """
    _draw_logo()
    model = _ensure_search_model(max_images=5000)

    # Bare string literals; presumably rendered as blank spacer lines by
    # Streamlit "magic" - TODO confirm.
    ""
    ""

    st.write("Please, show us how you are feeling today!")
    captured_image = webcam()
    if captured_image is None:
        st.write("Waiting for capture...")
        return

    captured_image = captured_image.convert("RGB")

    vit_feature_extractor, vit_model = _ensure_emotion_model()
    inputs = vit_feature_extractor(images=[captured_image], return_tensors="pt")
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = vit_model(**inputs)
    # Label for each logit position.
    # NOTE(review): assumes this order matches the checkpoint's class ids - confirm.
    emotions = ['Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise', 'Neutral']
    mood = emotions[np.argmax(outputs.logits.detach().cpu().numpy())]

    st.write(f"Your mood seems to be **{mood.lower()}** today! Here's a song for you that matches with how you feel!")

    _show_songs(model.image_search(mood))

## Shared page helpers
def _draw_logo():
    """Show the application logo across the full column width."""
    image = Image.open("data/logo.png")
    st.image(image, use_column_width="always")


def _ensure_search_model(max_images):
    """Return the session's CLIPDemo, building and caching it on first use.

    Args:
        max_images: Maximum number of spectrogram images to embed when the
            model has to be built.
    """
    if 'model' not in st.session_state:
        text_encoder = AutoModel.from_pretrained(CLIP_TEXT_MODEL_PATH, local_files_only=True)
        vision_encoder = CLIPVisionModel.from_pretrained(CLIP_VISION_MODEL_PATH, local_files_only=True)
        tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
        model = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer)
        model.compute_image_embeddings(glob.glob(SPECTROGRAMS_PATH + "/*.jpeg")[:max_images])
        # Stored only after the embeddings exist, so a failed build is retried.
        st.session_state["model"] = model
    return st.session_state["model"]


def _ensure_emotion_model():
    """Return the (feature extractor, ViT classifier) pair, loading it once per session."""
    if "emotion_model" not in st.session_state:
        st.session_state["emotion_feature_extractor"] = ViTFeatureExtractor.from_pretrained(
            "google/vit-base-patch16-224-in21k")
        st.session_state["emotion_model"] = ViTForImageClassification.from_pretrained(
            "ViT_ER/best_checkpoint", local_files_only=True)
    return st.session_state["emotion_feature_extractor"], st.session_state["emotion_model"]


def _show_songs(song_paths):
    """Write title / artist and an audio player for every path.

    Args:
        song_paths: Extension-less audio paths whose last six characters
            are the track id (as returned by ``CLIPDemo.image_search``).
    """
    for song in song_paths:
        track_id = int(song[-6:])
        # Look the track up once in the module-level metadata table.
        rows = df.loc[df['track_id'] == track_id]
        if rows.empty:
            # No metadata for this track: still offer the audio.
            st.write(f'Track {track_id:06d}')
        else:
            song_name = rows['track_title'].to_list()[0]
            artist_name = rows['artist_name'].to_list()[0]
            # f-string instead of "+" so a missing (NaN) field cannot raise.
            st.write(f'**"{song_name}"** by {artist_name}')
        st.audio(song + ".mp3", format="audio/mp3", start_time=0)
|
424 |
+
|
425 |
+
|
426 |
+
## Main
|
427 |
+
# Draw the navigation bar (horizontal, custom-styled variant).
selected = streamlit_menu(example=3)
# Track metadata; the page functions read it through the module-level name `df`.
df = pd.read_csv('full_metadata.csv', index_col=False)

# Dispatch to the page matching the selected menu entry.
if selected == "Text":
    draw_text("text", plot=True)
elif selected == "Audio":
    draw_audio("audio", plot=True)
elif selected == "Camera":
    # The camera page is switched off in this version. A bare `continue`
    # stood here before; outside a loop that is a SyntaxError which stops
    # the whole script from compiling, so use `pass` to do nothing.
    # draw_camera("camera", plot=True)
    pass
|
440 |
+
|
441 |
+
# with st.sidebar:
|
442 |
+
# draw_sidebar("sidebar")
|
full_metadata.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a7420377a357fb7989d375efbf758343279a92f58dec6ac37e825ed5e66f267
|
3 |
+
size 59247389
|
sample_mp3/000153.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:177bdc9d9289e67c4c24e20ba11355fcb4ab814cfac00efaddd591b61a58200a
|
3 |
+
size 961953
|
sample_mp3/000206.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d655b2b716591c30a6eca9926723eb62f0815c7bedd84969e24492bcaaf13a9c
|
3 |
+
size 960695
|
sample_mp3/000787.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0b1d9724bc37d9a64c4b3e02d126add063b60627c179d89c07f878b48ef6679
|
3 |
+
size 960752
|
sample_mp3/000913.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3d42b159e5d930089ab236b56d5aacf9d6b24f49c9edb6c1d72aea057b10a443
|
3 |
+
size 504278
|
sample_mp3/001123.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89ca19afab39ffd8449a44a2389466ab4ee6b15a6562f23ac2cbb2d188b55c85
|
3 |
+
size 960723
|
sample_mp3/001280.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:861dc2286ebb6c9fe7f6915d6446d78de3fde30908299614a2b987a4becd5650
|
3 |
+
size 960724
|
sample_mp3/001360.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a9af0fc2a1248af5213515f69c6e3416c73fe3bf36b4ac1c5662b6321bb3c34d
|
3 |
+
size 480423
|
sample_mp3/001550.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:00d34e0d72c6dbf821ccae9c428b67061014b986271f2fe71f4e17165391671e
|
3 |
+
size 1202281
|
sample_mp3/001662.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f40e2ecd735dcf8fa46bdaade20b793896c9bffc05e4fe445ed2e05d7a067a69
|
3 |
+
size 961033
|
sample_mp3/001713.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:986d0bf1dc96d859a0e39937d1d14751288aa8aaf5156bb16e1cf3cd5eed5e7b
|
3 |
+
size 960728
|
sample_mp3/001718.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fc3ef2ee31a2e84af6ef909cb456ae73b37423986c9f37ae77f23870b57fd195
|
3 |
+
size 961570
|
sample_mp3/001732.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c6d3c203b986c38571871b33b10ec10e738254c247f1b811ded3aebbce80445
|
3 |
+
size 962035
|
sample_mp3/001809.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a73fa97a4fd6dc1dfbd0b404a40b5aed16fc961a79b2a089a68a65c62c8ff55
|
3 |
+
size 961575
|
sample_mp3/001935.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:862fba3c5ec7c7d0a2332e4477b84df4a3ee995e2fea3e8590d2b0db2311d8de
|
3 |
+
size 961626
|
sample_mp3/003258.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:73a98db01a8988d0946aa75ea6e1ad3fc989395d1f9e05b216bf361fce73256b
|
3 |
+
size 960717
|
sample_mp3/003357.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:07f6030be1dbbbd355fcc9523efca36137362b626d0600457fdb4aa3054c8678
|
3 |
+
size 961641
|
sample_mp3/003459.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dbe77b83d19a62ccf66a95da0c0b019f0522a1c1e13b7dbe4b827c58c80e70cb
|
3 |
+
size 960855
|
sample_mp3/003505.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d9ea4a12172994dc4802eb34793807419c7089678c8e699f0b09fae9f5ec7b55
|
3 |
+
size 480647
|
sample_mp3/003685.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b5e86e45118faa81e79bf69bd559aed9ad27e97f1ba8edaa796e09a686ae494
|
3 |
+
size 961615
|
sample_mp3/003911.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60eafc9040ebe9eb05fb55eda59e2669c13ff8131ae604fc8dfb6060a300f169
|
3 |
+
size 961787
|
sample_mp3/003981.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52f9a190d2770a2ae983fc8a0132178f0506da930b2e90fba0459500c2b29101
|
3 |
+
size 961152
|
sample_mp3/004183.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3547dbc8eca936b57fe52d4ace1b3f887102edbc57c3bdbb0e49c92ad11a4fc3
|
3 |
+
size 960757
|
sample_mp3/004315.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57c2ab107d2f5aed9a3e166a9d7cf59435b81ffeb85e21280844dead580cadc7
|
3 |
+
size 960839
|
sample_mp3/004539.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:875e58c12d40abc0a29ec27f52c871976a3fdafb30a57c0d32ba18fd3984bdca
|
3 |
+
size 480541
|
sample_mp3/004753.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:770319876401bbcc0b6fd1699a0e725fd2b405db0ff8869e25642160b0503892
|
3 |
+
size 960902
|
sample_mp3/004909.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:970d2fbf3ace11266f514a517d6983f48affcbbbedaa576112aa7f55ab871df0
|
3 |
+
size 239611
|
sample_mp3/005220.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6fe36dbc534378170ca57356748b56823cfd707583cef3055dcd3e3bf3c41de8
|
3 |
+
size 960899
|
sample_mp3/005346.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d506284bbda862ad81eb0cb8e4099fa25cb11533fa92974c8ab770e5cb9a0c84
|
3 |
+
size 959700
|
sample_mp3/005619.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2950c0ae819e225ab1d7e8e597d709108e4500521cf10a4a991a33c6eb0287e5
|
3 |
+
size 960843
|
sample_mp3/006464.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aaeeb524796bc3265b841ee98e144cd0dfeb5ad76dd5210d0ddcc960adb42cc0
|
3 |
+
size 961678
|
sample_mp3/007429.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78562f56f7cec18cf3d9088769834ca87d7adea8a560b22a28865b732ab72cf4
|
3 |
+
size 480917
|
sample_mp3/007537.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:facc110264c04303ad1936a7607356e8e4fb88a22a8fb1fcc8fbf01216d9d4e7
|
3 |
+
size 960723
|
sample_mp3/007552.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8935b1a4d761ef7047b0ac3d74a90a5ad044ab686900a07d0203d8137b76771d
|
3 |
+
size 961534
|
sample_mp3/007559.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:30aa675ea1e0255fcd01332e367a914b55cbe07dc472462338a130dd5b596df2
|
3 |
+
size 961351
|
sample_mp3/007937.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35eb4a96e49dd8024dbdf05278f87ca6f4a5c4088e1af81d6a9b27c17dac8551
|
3 |
+
size 721656
|
sample_mp3/008140.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:408fab585f3564b40d92292b38e5e77b4d38142b04479a6230c2c266f85d7188
|
3 |
+
size 960921
|
sample_mp3/008168.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50312b4e318d7d7b57c32c51c2d37a3742080c4249f79adbb3409d0ab0b63221
|
3 |
+
size 1200980
|
sample_mp3/008370.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c3b4f49a8e76e44bd52e3669541dd1f03580e3e78d753c97eb836366944f004
|
3 |
+
size 961779
|
sample_mp3/008971.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:147e3f4070bd869b31f302e8a4d147a3998bef04518642c90098fe0123f7f48a
|
3 |
+
size 480859
|
sample_mp3/009210.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1aa1db5312d8ad2762a367a94d483883c0340cc15395b8f23acee8e90fb5bf80
|
3 |
+
size 961140
|
sample_mp3/009804.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a326311a907087fbe043b1a96afb7f9a19a3a341cace2b812366e23f5b9503d
|
3 |
+
size 961070
|
sample_mp3/009955.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:234d89501866dae0e559bb1d1da04a216c64b357ecfec2c8a8187544c7e5733e
|
3 |
+
size 721703
|
sample_mp3/010035.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eaff9f558301e5b67120c350a095f2778db161a6147c4bac44c0a793d0486074
|
3 |
+
size 961524
|
sample_mp3/010109.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1d590ef652fe116f6190edd7482864971ad5ddae98f12cf46e78747b9c806d13
|
3 |
+
size 1202331
|
sample_mp3/010212.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e1ef6246dd56854af002ab57e53b21c904a17be3e4804cb4c021aec7d9ead3e6
|
3 |
+
size 721062
|
sample_mp3/010259.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8cb003c9850c306a9c3ae4048f69aaeec48cdf7480ba5c6b955a74171b00ae93
|
3 |
+
size 600514
|
sample_mp3/010713.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3fb8955567041ad8e554e887da0888f418e1e72feeb9a695070f5ff4b9dc8d2a
|
3 |
+
size 719905
|