Commit 040ebdb · Parent(s): be60470 · Update app.py

app.py CHANGED
@@ -1,10 +1,8 @@
 import os
-os.system("pip install numpy==1.
+os.system("pip install numpy==1.23.0") #NumPy 1.24 or less needed by Numba. Use 1.23, librosa still uses np.complex which was dropped in NumPy 1.24
 os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
 os.system("pip install torch accelerate torchaudio datasets librosa easymms")
 
-#Transformers have a bug somewhere that conflicts with Numpy v1.19.0 and above.
-
 import gradio as gr
 from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
 from datasets import load_dataset, Audio, Dataset
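The new pin replaces the old "Transformers bug" guess with the actual cause: NumPy 1.24 removed the deprecated scalar aliases (np.complex, np.float, ...) that older librosa releases still reference. A minimal sketch of the failure mode, assuming NumPy >= 1.24 is installed:

```python
# Minimal sketch of why the pin exists (assumes NumPy >= 1.24):
# the deprecated aliases removed in 1.24 raise AttributeError on access,
# and older librosa releases still touch np.complex at import time.
import numpy as np

try:
    np.complex  # AttributeError on NumPy >= 1.24
except AttributeError:
    print("np.complex is gone; pin numpy==1.23.x until librosa drops it")
```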
@@ -12,14 +10,14 @@ import torch
 import librosa #For converting audio sample rate to 16k
 from easymms.models.tts import TTSModel #For TTS inference using EasyMMS
 
-
+LANG = "dtp"
 model_id = "facebook/mms-1b-all"
 
 #Set target language to dtp (Kadazandusun)
 processor = AutoProcessor.from_pretrained(model_id)
 model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
-processor.tokenizer.set_target_lang("dtp")
-model.load_adapter("dtp")
+processor.tokenizer.set_target_lang(LANG)
+model.load_adapter(LANG)
 
 asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
 
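This hunk factors the hardcoded language literal into a single LANG constant. Per the Transformers MMS documentation, the adapter languages shipped with mms-1b-all can be read off the tokenizer's vocab, which is a cheap way to validate LANG before load_adapter() fetches weights; a small sketch:

```python
# Sketch: confirm the target language has an adapter in mms-1b-all
# before switching to it (the vocab is keyed by ISO 639-3 codes).
supported = processor.tokenizer.vocab.keys()
assert "dtp" in supported, "no Kadazandusun adapter in this checkpoint"
```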
@@ -30,21 +28,18 @@ def preprocess(input): #Sets recording sampling rate to 16k and returns numpy nd
     audio_to_array = loaded_audio[0]["audio"]["array"]
     return audio_to_array
 
-def transcribe(input): #Gradio UI wrapper function
-    audioarray = preprocess(input) #Call preprocessor function
-    out = run(audioarray)
-    return out
-    #transcription = asr_pipeline(audioarray)
-    #return transcription["text"]
-
 def run(input):
-    inputs = processor(input, sampling_rate=16_000, return_tensors="pt")
+    inputs = processor(input, sampling_rate=16_000, return_tensors="pt")
     with torch.no_grad():
         outputs = model(**inputs).logits
     ids = torch.argmax(outputs, dim=-1)[0]
     transcription = processor.decode(ids)
     return transcription
 
+def transcribe(input): #Gradio UI wrapper function
+    audioarray = preprocess(input) #Call preprocessor function
+    out = run(audioarray)
+    return out
 
 with gr.Blocks(theme = gr.themes.Soft()) as demo:
     gr.HTML(
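This hunk moves transcribe() below run() so it is defined after the helper it calls, and drops the dead asr_pipeline comments. Only the last two lines of preprocess() appear as context, but indexing loaded_audio[0]["audio"]["array"] matches the datasets Audio-cast pattern; a hedged reconstruction of the unshown part, assuming the Gradio microphone component hands over a file path:

```python
# Hedged reconstruction of preprocess() (only its tail is in the hunk);
# Audio(sampling_rate=16_000) makes datasets resample on decode.
from datasets import Dataset, Audio

def preprocess_sketch(path):
    loaded_audio = Dataset.from_dict({"audio": [path]}).cast_column(
        "audio", Audio(sampling_rate=16_000))
    return loaded_audio[0]["audio"]["array"]  # numpy ndarray at 16 kHz
```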
@@ -68,7 +63,7 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
     </div></h6>
     """)
 
-    tts = TTSModel("dtp")
+    tts = TTSModel(LANG)
 
     def fn2(input):
         res = tts.synthesize(input)
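On the TTS side, EasyMMS's documented usage has synthesize() take a list of sentences and return (waveform, sample_rate) pairs; a sketch under that assumption, with a sample Kadazandusun input:

```python
# Sketch of EasyMMS TTS usage (assumes the documented list-in/list-out API).
from easymms.models.tts import TTSModel

tts = TTSModel("dtp")
for data, sample_rate in tts.synthesize(["Kopivosian!"]):
    print(sample_rate, len(data))  # waveform samples ready for gr.Audio
```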
@@ -87,11 +82,10 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
         """)
         with gr.Column(scale = 4):
             with gr.Tab("Rolou kumaa ginarit"):
-                #input = gr.components.Textbox(placeholder = "Potutakai suat nu hiti | Type something here")
                 input = gr.components.Audio(source = "microphone", label = "Gakamai rolou nu")
                 output = gr.components.Textbox(label = "Dalinsuat")
                 button1 = gr.Button("Dalinsuato' | Transcribe")
-                button1.click(
+                button1.click(transcribe, inputs = input, outputs = output)
 
             with gr.Tab("Ginarit kumaa rolou"):
                 input = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
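With this wiring, button1 drives the full chain transcribe → preprocess → run on each click. A hypothetical smoke test of run() outside Gradio (not part of the commit): a second of 16 kHz silence should decode without errors.

```python
# Hypothetical smoke test (not in the commit): run() accepts a 16 kHz
# float array, so silence should come back as an empty-ish transcription.
import numpy as np

print(repr(run(np.zeros(16_000, dtype=np.float32))))
```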