Spaces:
Running
Running
Irpan
committed on
Commit
·
bef8623
1
Parent(s):
ef107e3
asr
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import asr
|
|
|
3 |
# from tts import synthesize
|
4 |
|
5 |
|
@@ -24,31 +25,25 @@ mms_transcribe = gr.Interface(
|
|
24 |
allow_flagging="never",
|
25 |
)
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
#
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
# )
|
41 |
|
42 |
tabbed_interface = gr.TabbedInterface(
|
43 |
-
[mms_transcribe],
|
44 |
-
["Speech-to-text"],
|
45 |
)
|
46 |
|
47 |
-
# tabbed_interface = gr.TabbedInterface(
|
48 |
-
# [mms_transcribe, mms_synthesize],
|
49 |
-
# ["Speech-to-text", "Text-to-speech"],
|
50 |
-
# )
|
51 |
-
|
52 |
with gr.Blocks() as demo:
|
53 |
tabbed_interface.render()
|
54 |
|
|
|
1 |
import gradio as gr
|
2 |
import asr
|
3 |
+
import tts
|
4 |
# from tts import synthesize
|
5 |
|
6 |
|
|
|
25 |
allow_flagging="never",
|
26 |
)
|
27 |
|
28 |
+
# Text-to-speech tab: wraps tts.synthesize in a Gradio interface.
# tts.synthesize takes (text, model_id), so the interface needs a second
# input selecting the model; with only the text input the call would fail
# with a missing-argument TypeError at click time.
mms_synthesize = gr.Interface(
    fn=tts.synthesize,
    inputs=[
        gr.Text(label="Input text"),
        # Choices mirror the keys of tts.models_info; default to the only entry.
        gr.Dropdown(
            choices=["Meta-MMS"],
            value="Meta-MMS",
            label="Model",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
    ],
    # examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description=("Generate audio from input text."),
    allow_flagging="never",
)
|
|
|
41 |
|
42 |
# Assemble the app's tabs: one tab per interface, with matching display titles.
tabbed_interface = gr.TabbedInterface(
    interface_list=[mms_transcribe, mms_synthesize],
    tab_names=["Speech-to-text", "Text-to-speech"],
)
|
46 |
|
|
|
|
|
|
|
|
|
|
|
47 |
# Render the tabbed UI inside a Blocks container so Spaces can launch `demo`.
with gr.Blocks() as demo:
    tabbed_interface.render()
|
49 |
|
asr.py
CHANGED
@@ -12,7 +12,7 @@ import numpy as np
|
|
12 |
|
13 |
# Load processor and model
|
14 |
models_info = {
|
15 |
-
"
|
16 |
"processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
|
17 |
"model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
|
18 |
"ctc_model": False
|
@@ -27,7 +27,7 @@ models_info = {
|
|
27 |
"model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
|
28 |
"ctc_model": False
|
29 |
},
|
30 |
-
"
|
31 |
"processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
|
32 |
"model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
|
33 |
"ctc_model": True
|
|
|
12 |
|
13 |
# Load processor and model
|
14 |
models_info = {
|
15 |
+
"OpenAI-Whisper-Uzbek": {
|
16 |
"processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
|
17 |
"model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
|
18 |
"ctc_model": False
|
|
|
27 |
"model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
|
28 |
"ctc_model": False
|
29 |
},
|
30 |
+
"Meta-MMS": {
|
31 |
"processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
|
32 |
"model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
|
33 |
"ctc_model": True
|
tts.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import VitsModel, AutoTokenizer
import torch

# Registry of available TTS models, keyed by the display name used in the UI.
# Models are loaded once at import time so repeated synthesize() calls are cheap.
models_info = {
    "Meta-MMS": {
        "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
        "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
    },
}


def synthesize(text, model_id):
    """Generate speech audio for `text` using the model registered under `model_id`.

    Returns a (waveform, sampling_rate) tuple suitable for gr.Audio(type="numpy"):
    waveform is a 1-D float numpy array, sampling_rate an int.

    Raises KeyError if `model_id` is not a key of `models_info`.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    processor = models_info[model_id]["processor"]
    model = models_info[model_id]["model"].to(device)
    inputs = processor(text, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model(**inputs).waveform

    # Use the model's own output rate instead of a hard-coded 22050 Hz:
    # facebook/mms-tts-* VITS models synthesize at config.sampling_rate
    # (16 kHz), so resampling metadata must match or playback is pitched up.
    sampling_rate = model.config.sampling_rate

    # gr.Audio(type="numpy") expects a numpy array, not a torch tensor;
    # squeeze the batch dimension so the waveform is 1-D.
    return (output.squeeze().cpu().numpy(), sampling_rate)
|