Spaces: Runtime error
zahoor54321 committed
Commit • 8adf940 • 1 Parent(s): 4fe4722
Update app.py

app.py CHANGED
@@ -1,38 +1,25 @@
-
+import torch
+import torchaudio
import gradio as gr
-import
-import unicodedata
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

-
+# Load the custom model from Hugging Face Spaces
+model_name = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
+model = Wav2Vec2ForCTC.from_pretrained(model_name)
+processor = Wav2Vec2Processor.from_pretrained(model_name)

+# Define the transcribe function
def transcribe(audio):
-
-
+    waveform, sample_rate = torchaudio.load(audio, normalize=True)
+    input_dict = processor(waveform, return_tensors="pt", padding=True)
+    logits = model(input_dict.input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1).squeeze()
+    transcription = processor.decode(predicted_ids)
    return transcription

-
+# Define the interface
+audio_input = gr.inputs.Audio(source="microphone", type="numpy", label="Speak or Upload Audio")
text_output = gr.outputs.Textbox(label="Transcription")

-interface = gr.Interface(
-    fn=transcribe,
-    inputs=audio_input,
-    outputs=text_output,
-    title="Real-Time Urdu ASR",
-    description="""
-    <p>
-    <center>
-    This model is a fine-tuned version of facebook/wav2vec2-xls-r-300m on the common_voice dataset.
-    </center>
-    </p>
-    <center>
-    <img src="https://huggingface.co/spaces/kingabzpro/real-time-Urdu-ASR/resolve/main/Images/cover.jpg" alt="logo" width="550"/>
-    </center>
-    """,
-    article="""
-    <p style='text-align: center'><a href='https://dagshub.com/kingabzpro/Urdu-ASR-SOTA' target='_blank'>Source Code on DagsHub</a></p><p style='text-align: center'><a href='https://huggingface.co/blog/fine-tune-xlsr-wav2vec2' target='_blank'>Fine-tuning XLS-R for Multi-Lingual ASR with 🤗 Transformers</a></p></center><center><img src='https://visitor-badge.glitch.me/badge?page_id=kingabzpro/real-time-Urdu-ASR' alt='visitor badge'></center></p>
-    """,
-    theme='EveryPizza/Cartoony-Gradio-Theme',
-    live=True
-)
-
+interface = gr.Interface(fn=transcribe, inputs=audio_input, outputs=text_output, title="Speech Recognition")
interface.launch()
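
A likely cause of the Runtime error flagged at the top of this page: with type="numpy", Gradio passes transcribe() a (sample_rate, numpy_array) tuple rather than a file path, so torchaudio.load(audio) raises on the first recording. The processor call also omits sampling_rate, and XLS-R checkpoints expect 16 kHz mono audio. Below is a minimal sketch of a transcribe() matched to the numpy input; the PCM scaling, mono downmix, and resampling steps are assumptions about the standard wav2vec 2.0 pipeline, not part of this commit.

import numpy as np
import torch
import torchaudio

def transcribe(audio):
    # model and processor are the globals loaded above via from_pretrained()
    sample_rate, data = audio                  # type="numpy" yields (rate, array)
    data = data.astype(np.float32)
    if np.abs(data).max() > 1.0:               # microphone audio arrives as 16-bit PCM
        data = data / 32768.0                  # scale into [-1.0, 1.0]
    waveform = torch.from_numpy(data)
    if waveform.ndim > 1:                      # downmix stereo to mono
        waveform = waveform.mean(dim=-1)
    if sample_rate != 16000:                   # XLS-R was trained on 16 kHz audio
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    inputs = processor(waveform.numpy(), sampling_rate=16000,
                       return_tensors="pt", padding=True)
    with torch.no_grad():                      # inference only, no gradients
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1).squeeze()
    return processor.decode(predicted_ids)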
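
Separately, the gr.inputs and gr.outputs namespaces this commit still relies on were deprecated in Gradio 3 and removed in Gradio 4, so on a current runtime the app fails at import time, before transcribe() is ever called. A sketch of the same wiring against the Gradio 4 API, assuming a 4.x runtime: using type="filepath" makes the component hand the function a temp-file path, which would keep the committed torchaudio.load(audio) call working as written.

import gradio as gr

# Gradio 4.x: components are top-level classes, and
# sources=["microphone", "upload"] replaces source="microphone".
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath",
                    label="Speak or Upload Audio"),
    outputs=gr.Textbox(label="Transcription"),
    title="Speech Recognition",
)
interface.launch()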