TheFriendlyNPC committed on
Commit
8a7cc65
1 Parent(s): eaa338f
Files changed (2)
  1. app.py +78 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,78 @@
+ import gradio as gr
+ from transformers import pipeline
+ import time
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+ from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+ import torch
+ from fairseq.utils import move_to_cuda
+
+ """
+ Notes:
+ pip install sentencepiece
+ pip install phonemizer
+
+ Install ffmpeg and add it to PATH.
+
+ eSpeak must also be installed and on PATH.
+
+ Windows setup:
+ 1. Download and install the Windows version of espeak: http://espeak.sourceforge.net/download.html
+
+ 2. set PATH=%PATH%;"C:\Program Files (x86)\eSpeak\command_line"
+
+ 3. Install the .msi from https://github.com/espeak-ng/espeak-ng/releases
+
+ 4. Set the environment variables:
+    PHONEMIZER_ESPEAK_LIBRARY="c:\Program Files\eSpeak NG\libespeak-ng.dll"
+    PHONEMIZER_ESPEAK_PATH="c:\Program Files\eSpeak NG"
+
+ then restart the computer and run the same command again from the Command Prompt (cmd.exe).
+ """
+
+ asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
+ translation_pipeline = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")  # pin the en->es model explicitly; it makes fewer mistakes
+
+ # Spanish text-to-speech: fairseq transformer TTS with a HiFi-GAN vocoder
+ models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
+     "facebook/tts_transformer-es-css10",
+     arg_overrides={"vocoder": "hifigan", "fp16": False}
+ )
+
+ model = models[0]
+ if torch.cuda.is_available():
+     model = model.to(torch.device("cuda:0"))
+
+ TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
+ generator = task.build_generator(models, cfg)
+
+
+ def transcribe_translate(audio):
+     time.sleep(3)
+     text_en = asr(audio)["text"]
+
+     # the ASR output is all caps, and uppercase input translates differently
+     # ("good evening" -> "bonsoir" but "GOOD EVENING" -> "BONNES SÉANCES" with the en-fr model), so lowercase first
+     text_es = translation_pipeline(text_en.lower())
+     text_es = text_es[0]["translation_text"]
+
+     sample = TTSHubInterface.get_model_input(task, text_es)
+     sample = move_to_cuda(sample) if torch.cuda.is_available() else sample
+
+     wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
+     wav = wav.to("cpu").numpy()
+
+     return text_en, text_es, (rate, wav)
+
+
+ gr.Interface(
+     fn=transcribe_translate,
+     inputs=[
+         gr.Audio(source="microphone", type="filepath")
+     ],
+     outputs=[
+         gr.Textbox(label="English Transcription"),
+         gr.Textbox(label="Spanish Translation"),
+         gr.Audio(label="Spanish Audio")
+     ],
+     live=True).launch(share=True)
+
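Note: the environment-variable step described in the notes inside app.py can also be done from Python itself rather than system-wide. A minimal sketch, assuming eSpeak NG is installed at the path given in those notes (the paths below are that assumption; adjust them to the local machine):

import os

# Assumed install locations, copied from the Windows notes in app.py;
# point these at wherever eSpeak NG actually lives on your machine.
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\Program Files\eSpeak NG"

# Set these at the very top of app.py, before the TTS model is loaded or used,
# so the phonemizer backend picks them up when it resolves the eSpeak library.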
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ fairseq
+ torch
+ sentencepiece
+ phonemizer