Spaces:

ACloudCenter
/

canary-qwen-transcriber-2.5b

Running on Zero

ACloudCenter committed on Aug 26

Commit

e96a4b0

1 Parent(s): 1bfebf7

fix: Use batch loading with DynamicCutSampler to avoid audio shape errors. Approach borrowed from the NVIDIA example, for testing.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 import torch
 import spaces
 from lhotse import Recording
 from nemo.collections.speechlm2 import SALM
 # Set device to use cuda if available and sample rate to 16000 for Nvidia NeMo
@@ -26,15 +27,16 @@ def transcribe_audio(audio_filepath):
         cut = cut.to_mono(mono_downmix=True)
     # Load audio data
-    audio = cut.load_audio()
-    audio_lens = audio.shape[0]
     # Generate transcription
     with torch.inference_mode():
         output_ids = model.generate(
-            prompts=[[{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}]], # torch.as_tensor is used to convert the audio data to a tensor for model input
-            audios=torch.as_tensor(audio).unsqueeze(0).to(device),
-            audio_lens=torch.as_tensor([audio_lens]).to(device), # torch.as_tensor is used to convert the audio length to a tensor for model input
             max_new_tokens=256, # Maximum number of tokens to generate
         )

 import torch
 import spaces
 from lhotse import Recording
+from lhotse.dataset import DynamicCutSampler
 from nemo.collections.speechlm2 import SALM
 # Set device to use cuda if available and sample rate to 16000 for Nvidia NeMo
         cut = cut.to_mono(mono_downmix=True)
     # Load audio data
+    batch = DynamicCutSampler([cut], max_cuts=1)
+    for b in batch:
+        audio, audio_lens = b.load_audio(collate=True)
     # Generate transcription
     with torch.inference_mode():
         output_ids = model.generate(
+            prompts=[[{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}]],
+            audios=torch.as_tensor(audio).to(device),
+            audio_lens=torch.as_tensor(audio_lens).to(device),
             max_new_tokens=256, # Maximum number of tokens to generate
         )