Spaces:
badrex
/
Running on Zero

badrex committed on
Commit
27414b4
·
verified ·
1 Parent(s): 5d9d1f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -62
app.py CHANGED
@@ -1,77 +1,106 @@
1
- import gradio as gr
2
- from transformers import pipeline
3
- import numpy as np
4
  import os
5
- from huggingface_hub import login
6
- import librosa
7
  import spaces
 
 
8
 
9
- HF_TOKEN = os.environ.get("HF_TOKEN")
10
- if HF_TOKEN:
11
- login(token=HF_TOKEN)
12
-
13
- MODEL_ID = "badrex/w2v-bert-2.0-zulu-asr"
14
- transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)
15
-
16
-
17
- @spaces.GPU
18
- def transcribe(audio):
19
- sr, y = audio
20
-
21
- # convert to mono if stereo
22
- if y.ndim > 1:
23
- y = y.mean(axis=1)
24
-
25
- # resample to 16kHz if needed
26
- if sr != 16000:
27
- y = librosa.resample(y, orig_sr=sr, target_sr=16000)
28
-
29
- y = y.astype(np.float32)
30
- y /= np.max(np.abs(y))
31
-
32
- return transcriber({"sampling_rate": sr, "raw": y})["text"]
33
 
 
34
  examples = []
35
  examples_dir = "examples"
36
  if os.path.exists(examples_dir):
37
  for filename in os.listdir(examples_dir):
38
  if filename.endswith((".wav", ".mp3", ".ogg")):
39
  examples.append([os.path.join(examples_dir, filename)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- print(f"Found {len(examples)} example files")
42
- else:
43
- print("Examples directory not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- demo = gr.Interface(
47
- fn=transcribe,
48
- inputs=gr.Audio(),
49
- outputs="text",
50
- title="<div>Zulu ASR 🎙️ <br>Robust Speech Recognition for Zulu</div>",
51
- description="""
52
- <div class="centered-content">
53
- <div>
54
- <p>
55
- Developed with ❀ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a> ☕
56
- </p>
57
- <br>
58
- <p style="font-size: 15px; line-height: 1.8;">
59
- Hi there 👋🏼
60
- <br>
61
- <br>
62
- This is a demo for <a href="https://huggingface.co/badrex/w2v-bert-2.0-zulu-asr" style="color: #2563eb;">badrex/w2v-bert-2.0-zulu-asr</a>, a robust Transformer-based automatic speech recognition (ASR) system for the Zulu language, a Bantu language spoken in South Africa.
63
- The underlying ASR model was trained on 250 hours of high-quality human-transcribed speech based on the <a href="https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices" style="color: #2563eb;">Swivuriso: ZA-African Next Voices</a> dataset.
64
- <br>
65
- <p style="font-size: 15px; line-height: 1.8;">
66
- Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
67
- </p>
68
- </div>
69
- </div>
70
- """,
71
- examples=examples if examples else None,
72
- cache_examples=False,
73
- flagging_mode=None,
74
- )
75
 
 
76
  if __name__ == "__main__":
77
- demo.launch()
 
 
 
 
1
  import os
2
+ import torchaudio
3
+ import gradio as gr
4
  import spaces
5
+ import torch
6
+ from transformers import AutoProcessor, AutoModelForCTC
7
 
8
+ device = "cuda" if torch.cuda.is_available() else "cpu"
9
+ print(f"Using device: {device}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ # load examples
12
  examples = []
13
  examples_dir = "examples"
14
  if os.path.exists(examples_dir):
15
  for filename in os.listdir(examples_dir):
16
  if filename.endswith((".wav", ".mp3", ".ogg")):
17
  examples.append([os.path.join(examples_dir, filename)])
18
+
19
+ # Load model and processor
20
+ MODEL_PATH = "badrex/w2v-bert-2.0-zulu-asr"
21
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
22
+ model = AutoModelForCTC.from_pretrained(MODEL_PATH)
23
+
24
+ # move model and processor to device
25
+ model = model.to(device)
26
+ #processor = processor.to(device)
27
+
28
+ @spaces.GPU()
29
+ def process_audio(audio_path):
30
+ """Process audio and return the transcribed text.
31
+
32
+ Args:
33
+ audio_path: Path to the audio file to be transcribed.
34
+ Returns:
35
+ String containing the transcribed text from the audio file, or an error message
36
+ if the audio file is missing.
37
+ """
38
+ if not audio_path:
39
+ return "Please upload an audio file."
40
+
41
+ # get audio array
42
+ audio_array, sample_rate = torchaudio.load(audio_path)
43
+
44
+ # if sample rate is not 16000, resample to 16000
45
+ if sample_rate != 16000:
46
+ audio_array = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio_array)
47
+
48
+ #audio_array = audio_array.to(device)
49
+
50
+ inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")
51
+ inputs = {k: v.to(device) for k, v in inputs.items()}
52
+
53
+ #inputs = inputs.to(device, dtype=torch.bfloat16)
54
+
55
+ with torch.no_grad():
56
+ logits = model(**inputs).logits
57
+
58
+ outputs = torch.argmax(logits, dim=-1)
59
 
60
+ decoded_outputs = processor.batch_decode(
61
+ outputs,
62
+ skip_special_tokens=True
63
+ )
64
+
65
+ return decoded_outputs[0].strip()
66
+
67
+
68
+ # Define Gradio interface
69
+ with gr.Blocks(title="Voxtral Demo") as demo:
70
+ gr.Markdown("# isiZulu ASR 🎙️ Robust Speech Recognition for Zulu Language 🍋‍🟩")
71
+ gr.Markdown(
72
+ 'Developed with <span style="color:red;">❀</span> by <a href="https://badrex.github.io/">Badr al-Absi</a>'
73
+ )
74
+ gr.Markdown(
75
+ """### Hi there 👋🏼
76
 
77
+ This is a demo for [badrex/w2v-bert-2.0-zulu-asr](https://huggingface.co/badrex/w2v-bert-2.0-zulu-asr),
78
+ a robust Transformer-based automatic speech recognition (ASR) system for the Zulu language that was trained on 250+ hours of
79
+ high-quality human-transcribed speech based on the [ZA-African Next Voices](https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices) dataset.
80
+ """
81
+ )
82
+
83
+ gr.Markdown("Simply **upload an audio file** 📤 or **record yourself speaking** 🎙️⏺️ to try out the model!")
84
+
85
+ with gr.Row():
86
+ with gr.Column():
87
+ audio_input = gr.Audio(type="filepath", label="Upload Audio")
88
+ submit_btn = gr.Button("Transcribe Audio", variant="primary")
89
+
90
+ with gr.Column():
91
+ output_text = gr.Textbox(label="Text Transcription", lines=10)
92
 
93
+ submit_btn.click(
94
+ fn=process_audio,
95
+ inputs=[audio_input],
96
+ outputs=output_text
97
+ )
98
+
99
+ gr.Examples(
100
+ examples=examples if examples else None,
101
+ inputs=[audio_input],
102
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
+ # Launch the app
105
  if __name__ == "__main__":
106
+ demo.queue().launch() #share=False, ssr_mode=False, mcp_server=True