Update app.py
app.py
CHANGED
@@ -82,7 +82,7 @@ def analyze(path):
     json_structure_output = os.path.join(root, file_path)
     print(json_structure_output)

-
+    add_voice_label(json_structure_output, string_path)

     fig = allin1.visualize(
         result,
@@ -124,50 +124,49 @@ def analyze(path):
     #return result.bpm, fig, sonif_path, elapsed_time
     return result.bpm, fig, elapsed_time, json_structure_output, bass_path, drums_path, other_path, vocals_path

-def
-
-
-
-
-    # Create VAD object
-    vad_iterator = VADIterator(model)
-
-    # Read input audio file
-    wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
+def aggregate_vocal_times(vocal_times):
+    """
+    Aggregates multiple vocal segments into a single segment. This is done because
+    the detected segments are usually very short (<3 seconds) sections of audio.
+    """

-    #
-
-
-
-    for segment in segments:
-        start = segment['start']
-        end = segment['end']
-
-        start_sample = int(start*SAMPLING_RATE)
-        end_sample = int(end*SAMPLING_RATE)
-
-        speech_probs = []
-        window_size_samples = 1536
-        for i in range(start_sample, end_sample, window_size_samples):
-            chunk = torch.from_numpy(wav[i: i+ window_size_samples])
-            if len(chunk) < window_size_samples:
-                break
-            speech_prob = model(chunk, SAMPLING_RATE).item()
-            speech_probs.append(speech_prob)
-        vad_iterator.reset_states() # reset model states after each audio
-
-        mean_probability = np.mean(speech_probs)
-        print(mean_probability)
-
-        if mean_probability >= 0.7 :
-            segment['voice'] = "Yes"
-        else:
-            segment['voice'] = "No"
-
-    with open(json_file, 'w') as f:
-        json.dump(data, f, indent=4)
+    # This is a hyperparameter for the aggregation of the segments: we keep merging
+    # until we find a segment whose start_time is more than NEXT_SEGMENT_SECONDS after
+    # the end_time of the previous segment.
+    NEXT_SEGMENT_SECONDS = 5

-
+    try:
+        start_time = 0.0
+        end_time = 0.0
+        begin_seq = True
+        compressed_vocal_times = []
+        for vocal_time in vocal_times:
+            if begin_seq:
+                start_time = vocal_time['start_time']
+                end_time = vocal_time['end_time']
+                begin_seq = False
+                continue
+            if float(vocal_time['start_time']) < float(end_time) + NEXT_SEGMENT_SECONDS:
+                end_time = vocal_time['end_time']
+            else:
+                print(start_time, end_time)
+                compressed_vocal_times.append( {
+                    "start_time": f"{start_time}",
+                    "end_time": f"{end_time}"
+                    }
+                )
+                start_time = vocal_time['start_time']
+                end_time = vocal_time['end_time']
+        compressed_vocal_times.append( {
+            "start_time": f"{start_time}",
+            "end_time": f"{end_time}"
+            }
+        )
+    except Exception as e:
+        print(f"An exception occurred: {e}")
+    return compressed_vocal_times
+
+def add_voice_label(json_file, audio_path):
     # Load the JSON file
     with open(json_file, 'r') as f:
         data = json.load(f)
@@ -179,7 +178,9 @@ def add_voice_labelv2(json_file, audio_path):
     wav, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)

     speech_probs = []
-    # Size of the window we compute the probability on
+    # Size of the window we compute the probability on.
+    # This is a hyperparameter for the detection and can be changed to obtain
+    # different results. I found this value to be optimal.
     window_size_samples = int(SAMPLING_RATE/4)
     for i in range(0, len(wav), window_size_samples):
         chunk = torch.from_numpy(wav[i: i+ window_size_samples])
@@ -204,23 +205,20 @@ def add_voice_labelv2(json_file, audio_path):
             begin_seq = False
             if voice_idxs[i+1] == voice_idxs[i]+1:
                 continue
-
+
             start_time = float((start_idx*window_size_samples)/SAMPLING_RATE)
             end_time = float((voice_idxs[i]*window_size_samples)/SAMPLING_RATE)
-
-            start_minutes = int(start_time)
-            end_minutes = int(end_time)
-            start_seconds = (start_time - start_minutes) * 60
-            end_seconds = (end_time - end_minutes) * 60
-
-            print("modifying json data... \n")
+
             vocal_times.append( {
-                "start_time": f"{
-                "end_time": f"{
-                }
-
+                "start_time": f"{start_time:.2f}",
+                "end_time": f"{end_time:.2f}"
+                }
+            )
+
             begin_seq = True
-
+
+
+        vocal_times = aggregate_vocal_times(vocal_times)
         data['vocal_times'] = vocal_times

     except Exception as e:
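
For reviewers, here is a minimal, self-contained sketch of the merging rule that `aggregate_vocal_times` introduces above: segments whose `start_time` falls within `NEXT_SEGMENT_SECONDS` of the previous segment's `end_time` are collapsed into one segment. The helper name `merge_segments` and the sample timestamps are made up for illustration; only the thresholding logic mirrors the diff.

```python
# Illustrative only: a standalone rerun of the aggregation rule from this commit.
NEXT_SEGMENT_SECONDS = 5  # merge gap threshold, same value as in the diff

def merge_segments(vocal_times, gap=NEXT_SEGMENT_SECONDS):
    merged = []
    start_time = end_time = None
    for seg in vocal_times:
        if start_time is None:
            # The first segment opens the current run.
            start_time, end_time = seg["start_time"], seg["end_time"]
        elif float(seg["start_time"]) < float(end_time) + gap:
            # Close enough to the previous segment: extend the current run.
            end_time = seg["end_time"]
        else:
            # The gap is too large: close the run and start a new one.
            merged.append({"start_time": start_time, "end_time": end_time})
            start_time, end_time = seg["start_time"], seg["end_time"]
    if start_time is not None:
        merged.append({"start_time": start_time, "end_time": end_time})
    return merged

segments = [
    {"start_time": "1.00", "end_time": "2.50"},
    {"start_time": "4.00", "end_time": "6.00"},    # 4.00 < 2.50 + 5, so it extends the first run
    {"start_time": "20.00", "end_time": "22.00"},  # 20.00 >= 6.00 + 5, so it starts a new segment
]
print(merge_segments(segments))
# [{'start_time': '1.00', 'end_time': '6.00'}, {'start_time': '20.00', 'end_time': '22.00'}]
```

The committed function applies the same rule but wraps the loop in try/except and always appends the final run; the guard here only avoids emitting a segment for an empty input list.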
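The other change documented above is the voice-detection window: the waveform is cut into quarter-second windows (`window_size_samples = int(SAMPLING_RATE/4)`), each window is scored with `model(chunk, SAMPLING_RATE).item()`, and window indices are mapped back to seconds as `index * window_size_samples / SAMPLING_RATE`. Below is a sketch of that windowing arithmetic; the 16 kHz rate, the synthetic waveform, and the `fake_speech_prob` stand-in are assumptions made so the example runs without the VAD model or an audio file.

```python
import numpy as np

SAMPLING_RATE = 16000  # assumed here; the app defines its own SAMPLING_RATE
# Quarter-second windows, matching window_size_samples = int(SAMPLING_RATE/4) in the diff.
window_size_samples = int(SAMPLING_RATE / 4)

def fake_speech_prob(chunk):
    # Stand-in for model(chunk, SAMPLING_RATE).item(): a dummy score based on RMS
    # energy, used only to keep this sketch runnable without loading the VAD model.
    rms = float(np.sqrt(np.mean(chunk ** 2)))
    return min(1.0, rms * 10.0)

# One second of silence followed by one second of noise, in place of librosa.load(...).
wav = np.concatenate([np.zeros(SAMPLING_RATE, dtype=np.float32),
                      (0.1 * np.random.randn(SAMPLING_RATE)).astype(np.float32)])

speech_probs = []
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i:i + window_size_samples]
    if len(chunk) < window_size_samples:
        break  # ignore a trailing window shorter than window_size_samples
    speech_probs.append(fake_speech_prob(chunk))

# A window index maps back to seconds with the same arithmetic used in the diff:
# start_time = (index * window_size_samples) / SAMPLING_RATE
for idx, prob in enumerate(speech_probs):
    start_time = float((idx * window_size_samples) / SAMPLING_RATE)
    print(f"window {idx}: start {start_time:.2f}s, speech prob {prob:.2f}")
```

With a real file, `wav` would come from `librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)` and `fake_speech_prob` would be replaced by the `model(chunk, SAMPLING_RATE).item()` call shown in the diff.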