zahoor54321 committed on
Commit
7ec5198
1 Parent(s): 5916b45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -24
app.py CHANGED
@@ -1,25 +1,77 @@
1
- import torch
2
- import torchaudio
 
 
3
  import gradio as gr
4
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
5
-
6
- # Load the custom model from Hugging Face Spaces
7
- model_name = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
8
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
9
- processor = Wav2Vec2Processor.from_pretrained(model_name)
10
-
11
- # Define the transcribe function
12
- def transcribe(audio):
13
- waveform, sample_rate = torchaudio.load(audio, normalize=True)
14
- input_dict = processor(waveform, return_tensors="pt", padding=True)
15
- logits = model(input_dict.input_values).logits
16
- predicted_ids = torch.argmax(logits, dim=-1).squeeze()
17
- transcription = processor.decode(predicted_ids)
18
- return transcription
19
-
20
- # Define the interface
21
- audio_input = gr.inputs.Audio(source="microphone", type="numpy", label="Speak or Upload Audio")
22
- text_output = gr.outputs.Textbox(label="Transcription")
23
-
24
- interface = gr.Interface(fn=transcribe, inputs=audio_input, outputs=text_output, title="Speech Recognition")
25
- interface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import unicodedata
3
+ from datasets import load_dataset, Audio
4
+ from transformers import pipeline
5
  import gradio as gr
6
+ import torch
7
+
8
+ ############### HF ###########################
9
+
10
+ HF_TOKEN = os.getenv("HF_TOKEN")
11
+
12
+ hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "Urdu-ASR-flags")
13
+
14
+ ############## DagsHub ################################
15
+
16
+ Model = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
17
+ # The DagsHub streaming hooks below are disabled: they stopped working after Hugging Face changed its git server.
18
+ # from dagshub.streaming import install_hooks
19
+ # install_hooks()
20
+
21
+ ############## Inference ##############################
22
+
23
+
24
# Shared speech-recognition pipeline, created on first use so the model is
# loaded once per process instead of once per request.
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared ASR pipeline, building it on the first call."""
    global _asr_pipeline
    if _asr_pipeline is None:
        _asr_pipeline = pipeline("automatic-speech-recognition", model=Model)
    return _asr_pipeline


def asr(audio):
    """Transcribe an audio recording and return NFC-normalized text.

    Args:
        audio: Audio input forwarded to the speech-recognition pipeline
            (the Gradio input component is configured with
            ``type="filepath"``). ``None`` means nothing was recorded.

    Returns:
        The ``"text"`` field of the pipeline prediction, normalized to
        Unicode NFC, or an empty string when ``audio`` is ``None``.
    """
    # Nothing was recorded or uploaded: skip inference instead of failing
    # inside the pipeline.
    if audio is None:
        return ""

    recognizer = _get_asr_pipeline()
    # Long recordings are processed in 30-second chunks.
    prediction = recognizer(audio, chunk_length_s=30)
    return unicodedata.normalize("NFC", prediction["text"])
29
+
30
+
31
+ ################### Gradio Web APP ################################
32
+
33
+ title = "Urdu Automatic Speech Recognition"
34
+
35
+ description = """
36
+ <p>
37
+ <center>
38
+ This model is a fine-tuned version of facebook/wav2vec2-xls-r-300m on the common_voice dataset.
39
+ <img src="https://huggingface.co/spaces/kingabzpro/Urdu-ASR-SOTA/resolve/main/Images/cover.jpg" alt="logo" width="550"/>
40
+ </center>
41
+ </p>
42
+ """
43
+
44
+ article = "<p style='text-align: center'><a href='https://dagshub.com/kingabzpro/Urdu-ASR-SOTA' target='_blank'>Source Code on DagsHub</a></p><p style='text-align: center'><a href='https://huggingface.co/blog/fine-tune-xlsr-wav2vec2' target='_blank'>Fine-tuning XLS-R for Multi-Lingual ASR with 🤗 Transformers</a></p></center><center><img src='https://visitor-badge.glitch.me/badge?page_id=kingabzpro/Urdu-ASR-SOTA' alt='visitor badge'></center></p>"
45
+
46
+ examples = [["Sample/sample1.mp3"], ["Sample/sample2.mp3"], ["Sample/sample3.mp3"]]
47
+
48
+
49
+ Input = gr.Audio(
50
+ source="microphone",
51
+ type="filepath",
52
+ label="Please Record Your Voice",
53
+ )
54
+ Output = gr.Textbox(label="Urdu Script")
55
+
56
+
57
def main():
    """Build the Gradio interface for the ASR demo and start serving it.

    Wires ``asr`` to the module-level ``Input`` and ``Output`` components,
    enables manual flagging through ``hf_writer``, and launches the app
    with request queuing turned on.
    """
    iface = gr.Interface(
        asr,
        Input,
        Output,
        title=title,
        allow_flagging="manual",
        flagging_callback=hf_writer,
        description=description,
        article=article,
        examples=examples,
        theme="JohnSmith9982/small_and_pretty",
    )

    # Turn queuing on with .queue(); the `enable_queue` argument to
    # launch() is deprecated in favor of this method.
    iface.queue()
    iface.launch()
72
+
73
+
74
+ # enable_queue=True,auth=("admin", "pass1234")
75
+
76
+ if __name__ == "__main__":
77
+ main()