anderbogia committed on
Commit
040ebdb
·
1 Parent(s): be60470

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -17
app.py CHANGED
@@ -1,10 +1,8 @@
1
  import os
2
- os.system("pip install numpy==1.18.5") #NumPy 1.24 or less needed by Numba. Use 1.23, librosa still uses np.complex which was dropped in NumPy 1.24
3
  os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
4
  os.system("pip install torch accelerate torchaudio datasets librosa easymms")
5
 
6
- #Transformers have a bug somewhere that conflicts with Numpy v1.19.0 and above.
7
-
8
  import gradio as gr
9
  from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
10
  from datasets import load_dataset, Audio, Dataset
@@ -12,14 +10,14 @@ import torch
12
  import librosa #For converting audio sample rate to 16k
13
  from easymms.models.tts import TTSModel #For TTS inference using EasyMMS
14
 
15
-
16
  model_id = "facebook/mms-1b-all"
17
 
18
  #Set target language to dtp (Kadazandusun)
19
  processor = AutoProcessor.from_pretrained(model_id)
20
  model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
21
- processor.tokenizer.set_target_lang("dtp") #Change dtp to tih for Timugon Murut or iba for Iban
22
- model.load_adapter("dtp")
23
 
24
  asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
25
 
@@ -30,21 +28,18 @@ def preprocess(input): #Sets recording sampling rate to 16k and returns numpy nd
30
  audio_to_array = loaded_audio[0]["audio"]["array"]
31
  return audio_to_array
32
 
33
- def transcribe(input): #Gradio UI wrapper function
34
- audioarray = preprocess(input) #Call preprocessor function
35
- out = run(audioarray)
36
- return out
37
- #transcription = asr_pipeline(audioarray)
38
- #return transcription["text"]
39
-
40
  def run(input):
41
- inputs = processor(input, sampling_rate=16_000, return_tensors="pt")#.to("cuda")
42
  with torch.no_grad():
43
  outputs = model(**inputs).logits
44
  ids = torch.argmax(outputs, dim=-1)[0]
45
  transcription = processor.decode(ids)
46
  return transcription
47
 
 
 
 
 
48
 
49
  with gr.Blocks(theme = gr.themes.Soft()) as demo:
50
  gr.HTML(
@@ -68,7 +63,7 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
68
  </div></h6>
69
  """)
70
 
71
- tts = TTSModel('dtp')
72
 
73
  def fn2(input):
74
  res = tts.synthesize(input)
@@ -87,11 +82,10 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
87
  """)
88
  with gr.Column(scale = 4):
89
  with gr.Tab("Rolou kumaa ginarit"):
90
- #input = gr.components.Textbox(placeholder = "Potutakai suat nu hiti | Type something here")
91
  input = gr.components.Audio(source = "microphone", label = "Gakamai rolou nu")
92
  output = gr.components.Textbox(label = "Dalinsuat")
93
  button1 = gr.Button("Dalinsuato' | Transcribe")
94
- button1.click(run, inputs = input, outputs = output)
95
 
96
  with gr.Tab("Ginarit kumaa rolou"):
97
  input = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")
 
1
  import os
2
+ os.system("pip install numpy==1.23.0") #NumPy 1.24 or less needed by Numba. Use 1.23, librosa still uses np.complex which was dropped in NumPy 1.24
3
  os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
4
  os.system("pip install torch accelerate torchaudio datasets librosa easymms")
5
 
 
 
6
  import gradio as gr
7
  from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
8
  from datasets import load_dataset, Audio, Dataset
 
10
  import librosa #For converting audio sample rate to 16k
11
  from easymms.models.tts import TTSModel #For TTS inference using EasyMMS
12
 
13
+ LANG = "dtp"
14
  model_id = "facebook/mms-1b-all"
15
 
16
  #Set target language to dtp (Kadazandusun)
17
  processor = AutoProcessor.from_pretrained(model_id)
18
  model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
19
+ processor.tokenizer.set_target_lang(LANG)
20
+ model.load_adapter(LANG)
21
 
22
  asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"
23
 
 
28
  audio_to_array = loaded_audio[0]["audio"]["array"]
29
  return audio_to_array
30
 
 
 
 
 
 
 
 
31
  def run(input):
32
+ inputs = processor(input, sampling_rate=16_000, return_tensors="pt")
33
  with torch.no_grad():
34
  outputs = model(**inputs).logits
35
  ids = torch.argmax(outputs, dim=-1)[0]
36
  transcription = processor.decode(ids)
37
  return transcription
38
 
39
+ def transcribe(input): #Gradio UI wrapper function
40
+ audioarray = preprocess(input) #Call preprocessor function
41
+ out = run(audioarray)
42
+ return out
43
 
44
  with gr.Blocks(theme = gr.themes.Soft()) as demo:
45
  gr.HTML(
 
63
  </div></h6>
64
  """)
65
 
66
+ tts = TTSModel(LANG)
67
 
68
  def fn2(input):
69
  res = tts.synthesize(input)
 
82
  """)
83
  with gr.Column(scale = 4):
84
  with gr.Tab("Rolou kumaa ginarit"):
 
85
  input = gr.components.Audio(source = "microphone", label = "Gakamai rolou nu")
86
  output = gr.components.Textbox(label = "Dalinsuat")
87
  button1 = gr.Button("Dalinsuato' | Transcribe")
88
+ button1.click(transcribe, inputs = input, outputs = output)
89
 
90
  with gr.Tab("Ginarit kumaa rolou"):
91
  input = gr.components.Textbox(label = "Ginarit", placeholder = "Potutakai suat nu hiti")