Update app.py
app.py
CHANGED
@@ -82,7 +82,7 @@ def analyze(path):
     json_structure_output = os.path.join(root, file_path)
     print(json_structure_output)

-
+    add_voice_label(json_structure_output, string_path)

     fig = allin1.visualize(
         result,
@@ -124,50 +124,49 @@ def analyze(path):
     #return result.bpm, fig, sonif_path, elapsed_time
     return result.bpm, fig, elapsed_time, json_structure_output, bass_path, drums_path, other_path, vocals_path

-def
-
-
-
-
-    # Create VAD object
-    vad_iterator = VADIterator(model)
-
-    # Read input audio file
-    wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
+def aggregate_vocal_times(vocal_times):
+    """
+    Aggregates multiple vocal segments into a single segment. This is done because
+    the detected segments are usually very short (<3 seconds) sections of audio.
+    """

-    #
-
-
-
-    for segment in segments:
-        start = segment['start']
-        end = segment['end']
-
-        start_sample = int(start*SAMPLING_RATE)
-        end_sample = int(end*SAMPLING_RATE)
-
-        speech_probs = []
-        window_size_samples = 1536
-        for i in range(start_sample, end_sample, window_size_samples):
-            chunk = torch.from_numpy(wav[i: i+ window_size_samples])
-            if len(chunk) < window_size_samples:
-                break
-            speech_prob = model(chunk, SAMPLING_RATE).item()
-            speech_probs.append(speech_prob)
-        vad_iterator.reset_states() # reset model states after each audio
-
-        mean_probability = np.mean(speech_probs)
-        print(mean_probability)
-
-        if mean_probability >= 0.7 :
-            segment['voice'] = "Yes"
-        else:
-            segment['voice'] = "No"
-
-    with open(json_file, 'w') as f:
-        json.dump(data, f, indent=4)
+    # This is a hyperparameter for the aggregation of the segments: we keep merging
+    # until we find a segment whose start_time is more than NEXT_SEGMENT_SECONDS after
+    # the end_time of the previous segment.
+    NEXT_SEGMENT_SECONDS = 5

-
+    try:
+        start_time = 0.0
+        end_time = 0.0
+        begin_seq = True
+        compressed_vocal_times = []
+        for vocal_time in vocal_times:
+            if begin_seq:
+                start_time = vocal_time['start_time']
+                end_time = vocal_time['end_time']
+                begin_seq = False
+                continue
+            if float(vocal_time['start_time']) < float(end_time) + NEXT_SEGMENT_SECONDS:
+                end_time = vocal_time['end_time']
+            else:
+                print(start_time, end_time)
+                compressed_vocal_times.append( {
+                    "start_time": f"{start_time}",
+                    "end_time": f"{end_time}"
+                    }
+                )
+                start_time = vocal_time['start_time']
+                end_time = vocal_time['end_time']
+        compressed_vocal_times.append( {
+            "start_time": f"{start_time}",
+            "end_time": f"{end_time}"
+            }
+        )
+    except Exception as e:
+        print(f"An exception occurred: {e}")
+    return compressed_vocal_times
+
+def add_voice_label(json_file, audio_path):
     # Load the JSON file
     with open(json_file, 'r') as f:
         data = json.load(f)
@@ -179,7 +178,9 @@ def add_voice_labelv2(json_file, audio_path):
     wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)

     speech_probs = []
-    # Size of the window we compute the probability on
+    # Size of the window we compute the probability on.
+    # This is a hyperparameter for the detection and can be changed to obtain
+    # different results. I found this value to be optimal.
     window_size_samples = int(SAMPLING_RATE/4)
     for i in range(0, len(wav), window_size_samples):
         chunk = torch.from_numpy(wav[i: i+ window_size_samples])
@@ -204,23 +205,20 @@ def add_voice_labelv2(json_file, audio_path):
             begin_seq = False
             if voice_idxs[i+1] == voice_idxs[i]+1:
                 continue
-
+
             start_time = float((start_idx*window_size_samples)/SAMPLING_RATE)
             end_time = float((voice_idxs[i]*window_size_samples)/SAMPLING_RATE)
-
-            start_minutes = int(start_time)
-            end_minutes = int(end_time)
-            start_seconds = (start_time - start_minutes) * 60
-            end_seconds = (end_time - end_minutes) * 60
-
-            print("modifying json data... \n")
+
             vocal_times.append( {
-                "start_time": f"{
-                "end_time": f"{
-                }
-
+                "start_time": f"{start_time:.2f}",
+                "end_time": f"{end_time:.2f}"
+                }
+            )
+
             begin_seq = True
-
+
+
+        vocal_times = aggregate_vocal_times(vocal_times)
         data['vocal_times'] = vocal_times

     except Exception as e:
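
For reviewers, here is a minimal, self-contained sketch of the merging rule that `aggregate_vocal_times` introduces above: segments whose `start_time` falls within `NEXT_SEGMENT_SECONDS` of the previous segment's `end_time` are collapsed into one segment. The helper name `merge_segments` and the sample timestamps are made up for illustration; only the thresholding logic mirrors the diff.

```python
# Illustrative only: a standalone rerun of the aggregation rule from this commit.
NEXT_SEGMENT_SECONDS = 5  # merge gap threshold, same value as in the diff

def merge_segments(vocal_times, gap=NEXT_SEGMENT_SECONDS):
    merged = []
    start_time = end_time = None
    for seg in vocal_times:
        if start_time is None:
            # The first segment opens the current run.
            start_time, end_time = seg["start_time"], seg["end_time"]
        elif float(seg["start_time"]) < float(end_time) + gap:
            # Close enough to the previous segment: extend the current run.
            end_time = seg["end_time"]
        else:
            # The gap is too large: close the run and start a new one.
            merged.append({"start_time": start_time, "end_time": end_time})
            start_time, end_time = seg["start_time"], seg["end_time"]
    if start_time is not None:
        merged.append({"start_time": start_time, "end_time": end_time})
    return merged

segments = [
    {"start_time": "1.00", "end_time": "2.50"},
    {"start_time": "4.00", "end_time": "6.00"},    # 4.00 < 2.50 + 5, so it extends the first run
    {"start_time": "20.00", "end_time": "22.00"},  # 20.00 >= 6.00 + 5, so it starts a new segment
]
print(merge_segments(segments))
# [{'start_time': '1.00', 'end_time': '6.00'}, {'start_time': '20.00', 'end_time': '22.00'}]
```

The committed function applies the same rule but wraps the loop in try/except and always appends the final run; the guard here only avoids emitting a segment for an empty input list.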
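The other change documented above is the voice-detection window: the waveform is cut into quarter-second windows (`window_size_samples = int(SAMPLING_RATE/4)`), each window is scored with `model(chunk, SAMPLING_RATE).item()`, and window indices are mapped back to seconds as `index * window_size_samples / SAMPLING_RATE`. Below is a sketch of that windowing arithmetic; the 16 kHz rate, the synthetic waveform, and the `fake_speech_prob` stand-in are assumptions made so the example runs without the VAD model or an audio file.

```python
import numpy as np

SAMPLING_RATE = 16000  # assumed here; the app defines its own SAMPLING_RATE
# Quarter-second windows, matching window_size_samples = int(SAMPLING_RATE/4) in the diff.
window_size_samples = int(SAMPLING_RATE / 4)

def fake_speech_prob(chunk):
    # Stand-in for model(chunk, SAMPLING_RATE).item(): a dummy score based on RMS
    # energy, used only to keep this sketch runnable without loading the VAD model.
    rms = float(np.sqrt(np.mean(chunk ** 2)))
    return min(1.0, rms * 10.0)

# One second of silence followed by one second of noise, in place of librosa.load(...).
wav = np.concatenate([np.zeros(SAMPLING_RATE, dtype=np.float32),
                      (0.1 * np.random.randn(SAMPLING_RATE)).astype(np.float32)])

speech_probs = []
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i:i + window_size_samples]
    if len(chunk) < window_size_samples:
        break  # ignore a trailing window shorter than window_size_samples
    speech_probs.append(fake_speech_prob(chunk))

# A window index maps back to seconds with the same arithmetic used in the diff:
# start_time = (index * window_size_samples) / SAMPLING_RATE
for idx, prob in enumerate(speech_probs):
    start_time = float((idx * window_size_samples) / SAMPLING_RATE)
    print(f"window {idx}: start {start_time:.2f}s, speech prob {prob:.2f}")
```

With a real file, `wav` would come from `librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)` and `fake_speech_prob` would be replaced by the `model(chunk, SAMPLING_RATE).item()` call shown in the diff.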