Spaces:
Running
Running
Epsilon617
commited on
Commit
•
cb013a1
1
Parent(s):
479afcd
reformatting the outputs as dataframe
Browse files
Prediction_Head/__pycache__/MTGGenre_head.cpython-310.pyc
ADDED
Binary file (1.67 kB). View file
|
|
__pycache__/app.cpython-310.pyc
CHANGED
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ
|
|
app.py
CHANGED
@@ -10,6 +10,9 @@ import logging
|
|
10 |
|
11 |
import json
|
12 |
import os
|
|
|
|
|
|
|
13 |
|
14 |
import importlib
|
15 |
modeling_MERT = importlib.import_module("MERT-v1-95M.modeling_MERT")
|
@@ -36,8 +39,7 @@ inputs = [
|
|
36 |
live_inputs = [
|
37 |
gr.Audio(source="microphone",streaming=True, type="filepath"),
|
38 |
]
|
39 |
-
|
40 |
-
# outputs = [gr.components.Textbox(), transcription_df]
|
41 |
title = "One Model for All Music Understanding Tasks"
|
42 |
description = "An example of using the [MERT-v1-95M](https://huggingface.co/m-a-p/MERT-v1-95M) model as backbone to conduct multiple music understanding tasks with the universal represenation."
|
43 |
article = "The tasks include EMO, GS, MTGInstrument, MTGGenre, MTGTop50, MTGMood, NSynthI, NSynthP, VocalSetS, VocalSetT. \n\n More models can be referred at the [map organization page](https://huggingface.co/m-a-p)."
|
@@ -46,6 +48,17 @@ audio_examples = [
|
|
46 |
# ["input/example-2.wav"],
|
47 |
]
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
# Load the model and the corresponding preprocessor config
|
50 |
# model = AutoModel.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)
|
51 |
# processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public",trust_remote_code=True)
|
@@ -105,6 +118,7 @@ for task in TASKS:
|
|
105 |
|
106 |
model.to(device)
|
107 |
|
|
|
108 |
def model_infernce(inputs):
|
109 |
waveform, sample_rate = torchaudio.load(inputs)
|
110 |
|
@@ -112,7 +126,7 @@ def model_infernce(inputs):
|
|
112 |
|
113 |
# make sure the sample_rate aligned
|
114 |
if resample_rate != sample_rate:
|
115 |
-
print(f'setting rate from {sample_rate} to {resample_rate}')
|
116 |
resampler = T.Resample(sample_rate, resample_rate)
|
117 |
waveform = resampler(waveform)
|
118 |
|
@@ -129,13 +143,16 @@ def model_infernce(inputs):
|
|
129 |
all_layer_hidden_states = all_layer_hidden_states.mean(dim=2)
|
130 |
|
131 |
task_output_texts = ""
|
|
|
|
|
|
|
132 |
for task in TASKS:
|
133 |
num_class = len(ID2CLASS[task].keys())
|
134 |
if MERT_BEST_LAYER_IDX[task] == 'all':
|
135 |
logits = CLASSIFIERS[task](all_layer_hidden_states) # [1, 87]
|
136 |
else:
|
137 |
logits = CLASSIFIERS[task](all_layer_hidden_states[:, MERT_BEST_LAYER_IDX[task]])
|
138 |
-
print(f'task {task} logits:', logits.shape, 'num class:', num_class)
|
139 |
|
140 |
sorted_idx = torch.argsort(logits, dim = -1, descending=True)[0] # batch =1
|
141 |
sorted_prob,_ = torch.sort(nn.functional.softmax(logits[0], dim=-1), dim=-1, descending=True)
|
@@ -145,33 +162,40 @@ def model_infernce(inputs):
|
|
145 |
top_n_show = 3 if num_class >= 3 else num_class
|
146 |
task_output_texts = task_output_texts + f"TASK {task} output:\n" + "\n".join([str(ID2CLASS[task][str(sorted_idx[idx].item())])+f', probability: {sorted_prob[idx].item():.2%}' for idx in range(top_n_show)]) + '\n'
|
147 |
task_output_texts = task_output_texts + '----------------------\n'
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
def convert_audio(inputs, microphone):
|
156 |
if (microphone is not None):
|
157 |
inputs = microphone
|
|
|
|
|
158 |
|
159 |
-
text = model_infernce(inputs)
|
160 |
-
|
161 |
-
return text
|
162 |
-
|
163 |
def live_convert_audio(microphone):
|
164 |
if (microphone is not None):
|
165 |
inputs = microphone
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
return text
|
170 |
|
171 |
audio_chunked = gr.Interface(
|
172 |
fn=convert_audio,
|
173 |
inputs=inputs,
|
174 |
-
outputs=
|
175 |
allow_flagging="never",
|
176 |
title=title,
|
177 |
description=description,
|
@@ -182,7 +206,7 @@ audio_chunked = gr.Interface(
|
|
182 |
live_audio_chunked = gr.Interface(
|
183 |
fn=live_convert_audio,
|
184 |
inputs=live_inputs,
|
185 |
-
outputs=
|
186 |
allow_flagging="never",
|
187 |
title=title,
|
188 |
description=description,
|
@@ -204,5 +228,5 @@ with demo:
|
|
204 |
"Live Streaming Music"
|
205 |
]
|
206 |
)
|
207 |
-
demo.queue(concurrency_count=1, max_size=5)
|
208 |
demo.launch(show_api=False)
|
|
|
10 |
|
11 |
import json
|
12 |
import os
|
13 |
+
import re
|
14 |
+
|
15 |
+
import pandas as pd
|
16 |
|
17 |
import importlib
|
18 |
modeling_MERT = importlib.import_module("MERT-v1-95M.modeling_MERT")
|
|
|
39 |
live_inputs = [
|
40 |
gr.Audio(source="microphone",streaming=True, type="filepath"),
|
41 |
]
|
42 |
+
|
|
|
43 |
title = "One Model for All Music Understanding Tasks"
|
44 |
description = "An example of using the [MERT-v1-95M](https://huggingface.co/m-a-p/MERT-v1-95M) model as backbone to conduct multiple music understanding tasks with the universal represenation."
|
45 |
article = "The tasks include EMO, GS, MTGInstrument, MTGGenre, MTGTop50, MTGMood, NSynthI, NSynthP, VocalSetS, VocalSetT. \n\n More models can be referred at the [map organization page](https://huggingface.co/m-a-p)."
|
|
|
48 |
# ["input/example-2.wav"],
|
49 |
]
|
50 |
|
51 |
+
df_init = pd.DataFrame(columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
|
52 |
+
transcription_df = gr.DataFrame(value=df_init, label="Output Dataframe", row_count=(
|
53 |
+
0, "dynamic"), max_rows=30, wrap=True, overflow_row_behaviour='paginate')
|
54 |
+
# outputs = [gr.components.Textbox()]
|
55 |
+
outputs = [ transcription_df]
|
56 |
+
|
57 |
+
df_init_live = pd.DataFrame(columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
|
58 |
+
transcription_df_live = gr.DataFrame(value=df_init_live, label="Output Dataframe", row_count=(
|
59 |
+
0, "dynamic"), max_rows=30, wrap=True, overflow_row_behaviour='paginate')
|
60 |
+
outputs_live = [transcription_df_live]
|
61 |
+
|
62 |
# Load the model and the corresponding preprocessor config
|
63 |
# model = AutoModel.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)
|
64 |
# processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public",trust_remote_code=True)
|
|
|
118 |
|
119 |
model.to(device)
|
120 |
|
121 |
+
|
122 |
def model_infernce(inputs):
|
123 |
waveform, sample_rate = torchaudio.load(inputs)
|
124 |
|
|
|
126 |
|
127 |
# make sure the sample_rate aligned
|
128 |
if resample_rate != sample_rate:
|
129 |
+
# print(f'setting rate from {sample_rate} to {resample_rate}')
|
130 |
resampler = T.Resample(sample_rate, resample_rate)
|
131 |
waveform = resampler(waveform)
|
132 |
|
|
|
143 |
all_layer_hidden_states = all_layer_hidden_states.mean(dim=2)
|
144 |
|
145 |
task_output_texts = ""
|
146 |
+
df = pd.DataFrame(columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
|
147 |
+
df_objects = []
|
148 |
+
|
149 |
for task in TASKS:
|
150 |
num_class = len(ID2CLASS[task].keys())
|
151 |
if MERT_BEST_LAYER_IDX[task] == 'all':
|
152 |
logits = CLASSIFIERS[task](all_layer_hidden_states) # [1, 87]
|
153 |
else:
|
154 |
logits = CLASSIFIERS[task](all_layer_hidden_states[:, MERT_BEST_LAYER_IDX[task]])
|
155 |
+
# print(f'task {task} logits:', logits.shape, 'num class:', num_class)
|
156 |
|
157 |
sorted_idx = torch.argsort(logits, dim = -1, descending=True)[0] # batch =1
|
158 |
sorted_prob,_ = torch.sort(nn.functional.softmax(logits[0], dim=-1), dim=-1, descending=True)
|
|
|
162 |
top_n_show = 3 if num_class >= 3 else num_class
|
163 |
task_output_texts = task_output_texts + f"TASK {task} output:\n" + "\n".join([str(ID2CLASS[task][str(sorted_idx[idx].item())])+f', probability: {sorted_prob[idx].item():.2%}' for idx in range(top_n_show)]) + '\n'
|
164 |
task_output_texts = task_output_texts + '----------------------\n'
|
165 |
+
|
166 |
+
row_elements = [task]
|
167 |
+
for idx in range(top_n_show):
|
168 |
+
print(ID2CLASS[task])
|
169 |
+
# print('id', str(sorted_idx[idx].item()))
|
170 |
+
output_class_name = str(ID2CLASS[task][str(sorted_idx[idx].item())])
|
171 |
+
output_class_name = re.sub(r'^\w+---', '', output_class_name)
|
172 |
+
output_class_name = re.sub(r'^\w+\/\w+---', '', output_class_name)
|
173 |
+
# print('output name', output_class_name)
|
174 |
+
output_prob = f' {sorted_prob[idx].item():.2%}'
|
175 |
+
row_elements.append(output_class_name+output_prob)
|
176 |
+
# fill empty elment
|
177 |
+
for _ in range(4 - len(row_elements)):
|
178 |
+
row_elements.append(' ')
|
179 |
+
df_objects.append(row_elements)
|
180 |
+
df = pd.DataFrame(df_objects, columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
|
181 |
+
return df
|
182 |
|
183 |
def convert_audio(inputs, microphone):
|
184 |
if (microphone is not None):
|
185 |
inputs = microphone
|
186 |
+
df = model_infernce(inputs)
|
187 |
+
return df
|
188 |
|
|
|
|
|
|
|
|
|
189 |
def live_convert_audio(microphone):
|
190 |
if (microphone is not None):
|
191 |
inputs = microphone
|
192 |
+
df = model_infernce(inputs)
|
193 |
+
return df
|
|
|
|
|
194 |
|
195 |
audio_chunked = gr.Interface(
|
196 |
fn=convert_audio,
|
197 |
inputs=inputs,
|
198 |
+
outputs=outputs,
|
199 |
allow_flagging="never",
|
200 |
title=title,
|
201 |
description=description,
|
|
|
206 |
live_audio_chunked = gr.Interface(
|
207 |
fn=live_convert_audio,
|
208 |
inputs=live_inputs,
|
209 |
+
outputs=outputs_live,
|
210 |
allow_flagging="never",
|
211 |
title=title,
|
212 |
description=description,
|
|
|
228 |
"Live Streaming Music"
|
229 |
]
|
230 |
)
|
231 |
+
# demo.queue(concurrency_count=1, max_size=5)
|
232 |
demo.launch(show_api=False)
|