Irpan committed on
Commit
bef8623
·
1 Parent(s): ef107e3
Files changed (3) hide show
  1. app.py +16 -21
  2. asr.py +2 -2
  3. tts.py +23 -0
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import asr
 
3
  # from tts import synthesize
4
 
5
 
@@ -24,31 +25,25 @@ mms_transcribe = gr.Interface(
24
  allow_flagging="never",
25
  )
26
 
27
- # mms_synthesize = gr.Interface(
28
- # fn=synthesize,
29
- # inputs=[
30
- # gr.Text(label="Input text"),
31
- # ],
32
- # outputs=[
33
- # gr.Audio(label="Generated Audio", type="numpy"),
34
- # gr.Text(label="Filtered text after removing OOVs"),
35
- # ],
36
- # #examples=TTS_EXAMPLES,
37
- # title="Text-to-speech",
38
- # description=("Generate audio from input text."),
39
- # allow_flagging="never",
40
- # )
41
 
42
  tabbed_interface = gr.TabbedInterface(
43
- [mms_transcribe],
44
- ["Speech-to-text"],
45
  )
46
 
47
- # tabbed_interface = gr.TabbedInterface(
48
- # [mms_transcribe, mms_synthesize],
49
- # ["Speech-to-text", "Text-to-speech"],
50
- # )
51
-
52
  with gr.Blocks() as demo:
53
  tabbed_interface.render()
54
 
 
1
  import gradio as gr
2
  import asr
3
+ import tts
4
  # from tts import synthesize
5
 
6
 
 
25
  allow_flagging="never",
26
  )
27
 
28
# Text-to-speech tab. NOTE: tts.synthesize takes (text, model_id), but this
# interface exposes only a text input — bind the single registered model key
# here so the call does not raise TypeError for the missing argument.
mms_synthesize = gr.Interface(
    fn=lambda text: tts.synthesize(text, "Meta-MMS"),
    inputs=[
        gr.Text(label="Input text"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
    ],
    # examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description=("Generate audio from input text."),
    allow_flagging="never",
)
 
41
 
42
# Combine the ASR and TTS demos into a single tabbed app.
tabbed_interface = gr.TabbedInterface(
    interface_list=[mms_transcribe, mms_synthesize],
    tab_names=["Speech-to-text", "Text-to-speech"],
)

with gr.Blocks() as demo:
    tabbed_interface.render()
49
 
asr.py CHANGED
@@ -12,7 +12,7 @@ import numpy as np
12
 
13
  # Load processor and model
14
  models_info = {
15
- "openai/whisper-small-uzbek": {
16
  "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
17
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
18
  "ctc_model": False
@@ -27,7 +27,7 @@ models_info = {
27
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
28
  "ctc_model": False
29
  },
30
- "facebook/mms-1b-all": {
31
  "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
32
  "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
33
  "ctc_model": True
 
12
 
13
  # Load processor and model
14
  models_info = {
15
+ "OpenAI-Whisper-Uzbek": {
16
  "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
17
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
18
  "ctc_model": False
 
27
  "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
28
  "ctc_model": False
29
  },
30
+ "Meta-MMS": {
31
  "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
32
  "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
33
  "ctc_model": True
tts.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import VitsModel, AutoTokenizer
import torch

# Load tokenizer and model once at import time so every request reuses them.
models_info = {
    "Meta-MMS": {
        "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
        "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
    },
}


def synthesize(text, model_id="Meta-MMS"):
    """Synthesize speech for *text* with the VITS model registered under *model_id*.

    Args:
        text: Input string to vocalize.
        model_id: Key into ``models_info``. Defaults to "Meta-MMS" (the only
            registered model) so callers that pass just the text still work.

    Returns:
        ``(sampling_rate, waveform)`` in the order ``gr.Audio(type="numpy")``
        expects: an int sample rate followed by a 1-D float numpy array.

    Raises:
        KeyError: if *model_id* is not a key of ``models_info``.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    processor = models_info[model_id]["processor"]
    model = models_info[model_id]["model"].to(device)
    inputs = processor(text, return_tensors="pt").to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        output = model(**inputs).waveform

    # Use the model's own rate (16 kHz for MMS-TTS) rather than a hard-coded
    # 22050, which would make playback noticeably too fast.
    sampling_rate = model.config.sampling_rate

    # Gradio's numpy audio format is (sample_rate, np.ndarray), not a tensor;
    # squeeze drops the leading batch dimension of the (1, T) waveform.
    return (sampling_rate, output.squeeze().cpu().numpy())