helloWorld199 committed
Commit 15ac9eb · verified · 1 Parent(s): 5104d99

Update app.py

Files changed (1)
  1. app.py +55 -57
app.py CHANGED
@@ -82,7 +82,7 @@ def analyze(path):
     json_structure_output = os.path.join(root, file_path)
     print(json_structure_output)
 
-    add_voice_labelv2(json_structure_output, string_path)
+    add_voice_label(json_structure_output, string_path)
 
     fig = allin1.visualize(
         result,
@@ -124,50 +124,49 @@ def analyze(path):
     #return result.bpm, fig, sonif_path, elapsed_time
     return result.bpm, fig, elapsed_time, json_structure_output, bass_path, drums_path, other_path, vocals_path
 
-def add_voice_label(json_file, audio_path):
-    # Load the JSON file
-    with open(json_file, 'r') as f:
-        data = json.load(f)
-
-    # Create VAD object
-    vad_iterator = VADIterator(model)
-
-    # Read input audio file
-    wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
-
-    # Access the segments
-    segments = data['segments']
-
-    times = []
-    for segment in segments:
-        start = segment['start']
-        end = segment['end']
-
-        start_sample = int(start * SAMPLING_RATE)
-        end_sample = int(end * SAMPLING_RATE)
-
-        speech_probs = []
-        window_size_samples = 1536
-        for i in range(start_sample, end_sample, window_size_samples):
-            chunk = torch.from_numpy(wav[i: i + window_size_samples])
-            if len(chunk) < window_size_samples:
-                break
-            speech_prob = model(chunk, SAMPLING_RATE).item()
-            speech_probs.append(speech_prob)
-        vad_iterator.reset_states()  # reset model states after each audio
-
-        mean_probability = np.mean(speech_probs)
-        print(mean_probability)
-
-        if mean_probability >= 0.7:
-            segment['voice'] = "Yes"
-        else:
-            segment['voice'] = "No"
-
-    with open(json_file, 'w') as f:
-        json.dump(data, f, indent=4)
+def aggregate_vocal_times(vocal_times):
+    """
+    Aggregates multiple vocal segments into one single segment. This is done because
+    the detected segments are usually very short (<3 seconds) sections of audio.
+    """
+
+    # Hyperparameter for the aggregation: we keep merging segments until we find one
+    # whose start_time is more than NEXT_SEGMENT_SECONDS after the end_time of the
+    # previous segment.
+    NEXT_SEGMENT_SECONDS = 5
+
+    try:
+        start_time = 0.0
+        end_time = 0.0
+        begin_seq = True
+        compressed_vocal_times = []
+        for vocal_time in vocal_times:
+            if begin_seq:
+                start_time = vocal_time['start_time']
+                end_time = vocal_time['end_time']
+                begin_seq = False
+                continue
+            if float(vocal_time['start_time']) < float(end_time) + NEXT_SEGMENT_SECONDS:
+                end_time = vocal_time['end_time']
+            else:
+                print(start_time, end_time)
+                compressed_vocal_times.append({
+                    "start_time": f"{start_time}",
+                    "end_time": f"{end_time}"
+                })
+                start_time = vocal_time['start_time']
+                end_time = vocal_time['end_time']
+        compressed_vocal_times.append({
+            "start_time": f"{start_time}",
+            "end_time": f"{end_time}"
+        })
+    except Exception as e:
+        print(f"An exception occurred: {e}")
+    return compressed_vocal_times
 
-def add_voice_labelv2(json_file, audio_path):
+def add_voice_label(json_file, audio_path):
     # Load the JSON file
     with open(json_file, 'r') as f:
         data = json.load(f)
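
For context on the new helper, here is a minimal sketch (not part of the commit) of how `aggregate_vocal_times` behaves. The segment values below are hypothetical; with `NEXT_SEGMENT_SECONDS = 5`, the first three segments chain into one and the last stays separate.

```python
# Hypothetical input: short vocal segments as produced by add_voice_label.
vocal_times = [
    {"start_time": "1.00", "end_time": "3.25"},
    {"start_time": "5.75", "end_time": "8.00"},    # 5.75 < 3.25 + 5   -> merged
    {"start_time": "12.50", "end_time": "14.00"},  # 12.50 < 8.00 + 5  -> merged
    {"start_time": "30.00", "end_time": "33.00"},  # 30.00 >= 14.00 + 5 -> new segment
]

print(aggregate_vocal_times(vocal_times))
# [{'start_time': '1.00', 'end_time': '14.00'},
#  {'start_time': '30.00', 'end_time': '33.00'}]
```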
@@ -179,7 +178,9 @@ def add_voice_labelv2(json_file, audio_path):
     wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
 
     speech_probs = []
-    # Size of the window we compute the probability on
+    # Size of the window we compute the probability on.
+    # This is a hyperparameter for the detection and can be changed to obtain
+    # different results. I found this value to be optimal.
     window_size_samples = int(SAMPLING_RATE / 4)
     for i in range(0, len(wav), window_size_samples):
         chunk = torch.from_numpy(wav[i: i + window_size_samples])
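
A quick worked note on the window hyperparameter above, assuming `SAMPLING_RATE = 16000` (the rate Silero VAD models commonly use; the actual constant is defined elsewhere in app.py and may differ): each window is 4000 samples, so the model emits one speech probability per 0.25 s of audio, which is also the resolution of the timestamps recovered in the next hunk.

```python
SAMPLING_RATE = 16000                         # assumption; defined elsewhere in app.py
window_size_samples = int(SAMPLING_RATE / 4)  # 4000 samples per window

# One probability per window -> time resolution of the detection:
resolution = window_size_samples / SAMPLING_RATE  # 0.25 seconds

# Window index k maps back to seconds the same way the code in the next hunk does:
k = 10
start_time = (k * window_size_samples) / SAMPLING_RATE  # 2.5 s
print(resolution, start_time)
```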
@@ -204,23 +205,20 @@ def add_voice_labelv2(json_file, audio_path):
                 begin_seq = False
             if voice_idxs[i+1] == voice_idxs[i]+1:
                 continue
 
             start_time = float((start_idx * window_size_samples) / SAMPLING_RATE)
             end_time = float((voice_idxs[i] * window_size_samples) / SAMPLING_RATE)
 
-            start_minutes = int(start_time)
-            end_minutes = int(end_time)
-            start_seconds = (start_time - start_minutes) * 60
-            end_seconds = (end_time - end_minutes) * 60
-
-            print("modifying json data... \n")
             vocal_times.append({
-                "start_time": f"{start_minutes}.{start_seconds:.0f}",
-                "end_time": f"{end_minutes}.{end_seconds:.0f}"
+                "start_time": f"{start_time:.2f}",
+                "end_time": f"{end_time:.2f}"
             })
 
             begin_seq = True
 
+        vocal_times = aggregate_vocal_times(vocal_times)
         data['vocal_times'] = vocal_times
 
     except Exception as e:
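
The last hunk also changes how timestamps are serialized. A small sketch, with a made-up boundary at 83.5 s, of why this matters: the old code treated the whole seconds as "minutes" and rescaled the fractional part by 60, so the strings it produced could not be parsed back as seconds, while the new format is plain seconds with two decimals and round-trips through `float()` (which `aggregate_vocal_times` relies on).

```python
t = 83.5  # hypothetical segment boundary, in seconds

# Old encoding: integer part plus fraction * 60 -> "83.30", not parseable back to 83.5
old = f"{int(t)}.{(t - int(t)) * 60:.0f}"

# New encoding: plain seconds, round-trips with float()
new = f"{t:.2f}"  # "83.50"
print(old, new)
```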
 