crypto-code committed · Commit 686c4ae · 1 parent: d9cb0bd
Update app.py

app.py CHANGED
@@ -20,6 +20,7 @@ import torchvision.transforms as transforms
 import av
 import subprocess
 import librosa
+import re
 
 args = {"model": "./ckpts/checkpoint.pth", "llama_type": "7B", "llama_dir": "./ckpts/LLaMA-2",
     "mert_path": "m-a-p/MERT-v1-330M", "vit_path": "google/vit-base-patch16-224", "vivit_path": "google/vivit-b-16x2-kinetics400",
@@ -33,8 +34,6 @@ class dotdict(dict):
 
 args = dotdict(args)
 
-generated_audio_files = []
-
 llama_type = args.llama_type
 llama_ckpt_dir = os.path.join(args.llama_dir, llama_type)
 llama_tokenzier_path = args.llama_dir
@@ -118,7 +117,6 @@ def parse_text(text, image_path, video_path, audio_path):
 
 
 def save_audio_to_local(audio, sec):
-    global generated_audio_files
     if not os.path.exists('temp'):
         os.mkdir('temp')
     filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.wav')
@@ -126,7 +124,6 @@ def save_audio_to_local(audio, sec):
         scipy.io.wavfile.write(filename, rate=16000, data=audio[0])
     else:
         scipy.io.wavfile.write(filename, rate=model.generation_model.config.audio_encoder.sampling_rate, data=audio)
-    generated_audio_files.append(filename)
     return filename
 
 
@@ -166,8 +163,6 @@ def reset_dialog():
 
 
 def reset_state():
-    global generated_audio_files
-    generated_audio_files = []
     return None, None, None, None, [], [], []
 
 
@@ -214,6 +209,12 @@ def get_video_length(filename):
 def get_audio_length(filename):
     return int(round(librosa.get_duration(path=filename)))
 
+def get_last_audio():
+    for hist in history[::-1]:
+        print(hist)
+        if "<audio controls playsinline>" in hist[1]:
+            return re.search('<audio controls playsinline><source src=\"\.\/file=(.*)\" type="audio\/wav"><\/audio>', hist[1]).group(1)
+    return None
 
 def predict(
     prompt_input,
@@ -226,7 +227,6 @@ def predict(
     history,
     modality_cache,
     audio_length_in_s):
-    global generated_audio_files
     prompts = [llama.format_prompt(prompt_input)]
     prompts = [model.tokenizer(x).input_ids for x in prompts]
     print(image_path, audio_path, video_path)
@@ -244,11 +244,11 @@ def predict(
         container = av.open(video_path)
         indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
         video = read_video_pyav(container=container, indices=indices)
-
-        if len(generated_audio_files) != 0:
-            audio_length_in_s = get_audio_length(generated_audio_files[-1])
+        generated_audio_file = get_last_audio()
+        if generated_audio_file is not None:
+            audio_length_in_s = get_audio_length(generated_audio_file)
             sample_rate = 24000
-            waveform, sr = torchaudio.load(generated_audio_files[-1])
+            waveform, sr = torchaudio.load(generated_audio_file)
             if sample_rate != sr:
                 waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=sample_rate)
             audio = torch.mean(waveform, 0)
@@ -259,7 +259,6 @@ def predict(
         print(f"Video Length: {audio_length_in_s}")
     if audio_path is not None:
         audio_length_in_s = get_audio_length(audio_path)
-        generated_audio_files.append(audio_path)
         print(f"Audio Length: {audio_length_in_s}")
 
     print(image, video, audio)
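
The new get_last_audio() replaces the removed generated_audio_files global: rather than tracking generated files in a list, it recovers the most recent clip by scanning the chat history for the <audio> tag that the app embeds in bot replies. Below is a minimal, self-contained sketch of that logic. Unlike the committed version, which reads history as a free variable, the sketch takes it as a parameter; the compiled-pattern name, sample history, and file paths are illustrative, not from the commit.

import re

# Pattern the commit uses to locate generated audio in a bot message;
# the (.*) group captures the file path embedded in the <audio> tag.
AUDIO_TAG_RE = re.compile(
    r'<audio controls playsinline><source src="\./file=(.*)" type="audio/wav"></audio>'
)

def get_last_audio(history):
    """Return the path of the most recently generated audio file, or None."""
    # Walk the chat history newest-first; each entry is a (user, bot) pair.
    for _, bot_message in reversed(history):
        match = AUDIO_TAG_RE.search(bot_message)
        if match:
            return match.group(1)
    return None

# Hypothetical Gradio-style history for illustration:
history = [
    ("compose a short melody",
     'Done: <audio controls playsinline><source src="./file=temp/out.wav" type="audio/wav"></audio>'),
    ("what key is it in?", "C major."),
]
print(get_last_audio(history))  # -> temp/out.wav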
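
Once a path is recovered, the new predict() hunk loads the file, resamples it to a fixed 24 kHz, and averages the channels down to mono before passing the waveform to the model. A standalone sketch of that preprocessing follows; the helper name is hypothetical, since the commit inlines this logic in predict().

import torch
import torchaudio

def load_audio_mono_24k(path):
    """Load audio, resample to 24 kHz, and downmix to mono,
    mirroring the preprocessing in the updated predict()."""
    target_rate = 24000
    waveform, sr = torchaudio.load(path)  # (channels, frames), source rate
    if sr != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sr, new_freq=target_rate
        )
    # Average across channels to produce a 1-D mono signal.
    return torch.mean(waveform, 0), target_rate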