lyhourt committed
Commit c1b9b25 · 1 Parent(s): d5d2a64

Update app.py

Files changed (1):
  1. app.py +17 -4
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import torch
+import torchaudio
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 
 model_id = "lyhourt/whisper-small-clean_6-v4"
@@ -18,16 +19,28 @@ pipe = pipeline(
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
     max_new_tokens=128,
-    chunk_length_s=30,
+    chunk_length_s=30,  # You can increase this if needed
     batch_size=16,
     return_timestamps=True,
     torch_dtype=torch_dtype,
     device=device,
 )
 
-def transcribe(audio):
-    text = pipe(audio)["text"]
-    return text
+def transcribe(audio_path):
+    waveform, sample_rate = torchaudio.load(audio_path)
+    # Split the audio into chunks of 30 seconds (or your desired chunk length)
+    chunk_length = 30 * sample_rate  # 30 seconds
+    chunks = [waveform[:, i:i + chunk_length] for i in range(0, waveform.size(1), chunk_length)]
+
+    texts = []
+    for chunk in chunks:
+        chunk = chunk.to(device)
+        text = pipe(chunk)["text"]
+        texts.append(text)
+
+    # Concatenate all texts
+    full_text = " ".join(texts)
+    return full_text
 
 iface = gr.Interface(
     fn=transcribe,
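A note on the new transcribe: in recent transformers releases, the ASR pipeline accepts a file path, a 1-D numpy array, or a {"raw", "sampling_rate"} dict, but not a raw torch tensor, so pipe(chunk) as committed is likely to raise. The manual 30-second slicing also duplicates the long-form chunking that chunk_length_s=30 already performs inside the pipeline, and hard cuts can split words at chunk boundaries. Below is a minimal alternative sketch, not the committed code, assuming the pipe object and imports defined above; the 16 kHz target is the rate Whisper's feature extractor expects.

import torchaudio

TARGET_SR = 16_000  # Whisper's feature extractor expects 16 kHz audio

def transcribe(audio_path):
    # Load the clip; torchaudio returns a (channels, samples) float tensor
    waveform, sample_rate = torchaudio.load(audio_path)
    # Downmix to mono, since the pipeline expects a 1-D waveform
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample if the source rate differs from the model's expected rate
    if sample_rate != TARGET_SR:
        waveform = torchaudio.functional.resample(waveform, sample_rate, TARGET_SR)
    # Hand the pipeline a numpy array plus its sampling rate; chunk_length_s=30
    # in the pipeline config handles splitting long audio into windows
    result = pipe({"raw": waveform.squeeze(0).numpy(), "sampling_rate": TARGET_SR})
    return result["text"]

Letting the pipeline do the chunking also keeps return_timestamps=True meaningful, since its overlapping windows are stitched back into a single timeline, which per-chunk calls cannot do.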