Zulu-ASR

Running on Zero

App Files Files Community

badrex committed on Oct 20

Commit

27414b4

verified ·

1 Parent(s): 5d9d1f9

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -62

app.py CHANGED Viewed

@@ -1,77 +1,106 @@
-import gradio as gr
-from transformers import pipeline
-import numpy as np
 import os
-from huggingface_hub import login
-import librosa
 import spaces
-HF_TOKEN = os.environ.get("HF_TOKEN")
-if HF_TOKEN:
-    login(token=HF_TOKEN)
-MODEL_ID = "badrex/w2v-bert-2.0-zulu-asr"
-transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)
-@spaces.GPU
-def transcribe(audio):
-    sr, y = audio
-    # convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-    # resample to 16kHz if needed
-    if sr != 16000:
-        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    return transcriber({"sampling_rate": sr, "raw": y})["text"]
 examples = []
 examples_dir = "examples"
 if os.path.exists(examples_dir):
     for filename in os.listdir(examples_dir):
         if filename.endswith((".wav", ".mp3", ".ogg")):
             examples.append([os.path.join(examples_dir, filename)])
-    print(f"Found {len(examples)} example files")
-else:
-    print("Examples directory not found")
-demo = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(),
-    outputs="text",
-    title="<div>Zulu ASR 🎙️ <br>Robust Speech Recognition for Zulu</div>",
-    description="""
-        <div class="centered-content">
-            <div>
-                <p>
-                Developed with ❤ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a> ☕
-                </p>
-                <br>
-                <p style="font-size: 15px; line-height: 1.8;">
-                 Hi there 👋🏼
-                <br>
-                <br>
-                 This is a demo for <a href="https://huggingface.co/badrex/w2v-bert-2.0-zulu-asr" style="color: #2563eb;">badrex/w2v-bert-2.0-zulu-asr</a>, a robust Transformer-based automatic speech recognition (ASR) system for the Zulu language, a Bantu language spoken in South Africa.
-                 The underlying ASR model was trained on 250 hours of high-quality human-transcribed speech based on the <a href="https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices" style="color: #2563eb;">Swivuriso: ZA-African Next Voices</a> dataset.
-                <br>
-                <p style="font-size: 15px; line-height: 1.8;">
-                Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
-                </p>
-            </div>
-        </div>
-        """,
-    examples=examples if examples else None,
-    cache_examples=False,
-    flagging_mode=None,
-)
 if __name__ == "__main__":
-    demo.launch()

 import os
+import torchaudio
+import gradio as gr
 import spaces
+import torch
+from transformers import AutoProcessor, AutoModelForCTC
+# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# Collect example audio files (.wav/.mp3/.ogg) from the local "examples" directory, if it exists.
+# Each entry is a one-element list holding the file path.
 examples = []
 examples_dir = "examples"
 if os.path.exists(examples_dir):
     for filename in os.listdir(examples_dir):
         if filename.endswith((".wav", ".mp3", ".ogg")):
             examples.append([os.path.join(examples_dir, filename)])
+# Load the model and its processor from the same checkpoint
+MODEL_PATH = "badrex/w2v-bert-2.0-zulu-asr"
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+model = AutoModelForCTC.from_pretrained(MODEL_PATH)
+# Move the model to the selected device; the processor is not moved (that call is disabled below)
+model = model.to(device)
+#processor = processor.to(device)
+@spaces.GPU()
+def process_audio(audio_path):
+    """Transcribe an audio file and return the recognized text.
+
+    Args:
+        audio_path: Path to the audio file to be transcribed.
+
+    Returns:
+        String containing the transcribed text from the audio file, or an error message
+        if the audio file is missing.
+    """
+    if not audio_path:
+        return "Please upload an audio file."
+    # torchaudio.load returns the waveform tensor (channels first) and its native sample rate
+    audio_array, sample_rate = torchaudio.load(audio_path)
+    # Down-mix multi-channel recordings to mono, as the previous version of this app did.
+    # keepdim=True keeps the (1, frames) layout that mono files already have.
+    if audio_array.dim() > 1 and audio_array.size(0) > 1:
+        audio_array = audio_array.mean(dim=0, keepdim=True)
+    # if sample rate is not 16000, resample to 16000
+    if sample_rate != 16000:
+        audio_array = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio_array)
+    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")
+    # The extracted features must live on the same device as the model
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    # Inference only: no gradients are needed
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    # Greedy CTC decoding: take the most likely token at every frame and let the processor decode it
+    outputs = torch.argmax(logits, dim=-1)
+    decoded_outputs = processor.batch_decode(
+        outputs,
+        skip_special_tokens=True
+    )
+    return decoded_outputs[0].strip()
+# Define Gradio interface: description on top, audio input and transcription side by side,
+# and an optional gallery of example recordings at the bottom.
+with gr.Blocks(title="isiZulu ASR") as demo:
+    gr.Markdown("# isiZulu ASR 🎙️ Robust Speech Recognition for Zulu Language 🍋‍🟩")
+    gr.Markdown(
+        'Developed with <span style="color:red;">❤</span> by <a href="https://badrex.github.io/">Badr al-Absi</a>'
+    )
+    gr.Markdown(
+        """### Hi there 👋🏼
+This is a demo for [badrex/w2v-bert-2.0-zulu-asr](https://huggingface.co/badrex/w2v-bert-2.0-zulu-asr),
+a robust Transformer-based automatic speech recognition (ASR) system for the Zulu language that was trained on 250+ hours of
+high-quality human-transcribed speech based on the [ZA-African Next Voices](https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices) dataset.
+    """
+    )
+    gr.Markdown("Simply **upload an audio file** 📤 or **record yourself speaking** 🎙️⏺️ to try out the model!")
+    with gr.Row():
+        with gr.Column():
+            # type="filepath" hands process_audio a path on disk rather than a raw array
+            audio_input = gr.Audio(type="filepath", label="Upload Audio")
+            submit_btn = gr.Button("Transcribe Audio", variant="primary")
+        with gr.Column():
+            output_text = gr.Textbox(label="Text Transcription", lines=10)
+    submit_btn.click(
+        fn=process_audio,
+        inputs=[audio_input],
+        outputs=output_text
+    )
+    # Only render the examples gallery when example files were found:
+    # gr.Examples needs an actual list (or a directory path), not None.
+    if examples:
+        gr.Examples(
+            examples=examples,
+            inputs=[audio_input],
+        )
+# Launch the app only when this file is run as a script; queue() is enabled before launch().
 if __name__ == "__main__":
+    demo.queue().launch()  # launch options left disabled: share=False, ssr_mode=False, mcp_server=True