Upload 9 files
- app.py +70 -0
- example1.mp3 +0 -0
- example2.mp3 +0 -0
- example3.mp3 +0 -0
- example4.mp3 +0 -0
- goai_stt.py +60 -0
- goai_traduction.py +21 -0
- goai_tts.py +41 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,70 @@
import gradio as gr

import goai_stt, goai_tts, goai_traduction

# language_list = ['mos', 'fra', 'eng']

demo = gr.Blocks()

# Name the interfaces distinctly so they do not shadow the imported helper modules.
stt_interface = gr.Interface(
    fn=goai_stt.goai_stt,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="numpy")
    ],
    outputs="text",
    # each example must match the single audio input
    examples=[["./example1.mp3"],
              ["./example2.mp3"],
              ["./example3.mp3"],
              ["./example4.mp3"]
    ],
    title="Transcription Mooré: audio vers texte",
    description="Démo de transcription de la parole vers le texte en langage Mooré. Enregistrez l'audio à partir de votre micro ou uploadez-le depuis votre appareil!",
)

tts_interface = gr.Interface(
    fn=goai_tts.goai_tts,
    inputs=[
        gr.Text(label="Input text")
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy")
    ],
    examples=[["a ye ligdi"],
              ["zoe nimbãanega"],
              ["zãng-zãnga"],
              ["yõk foto"]
    ],
    title="Synthèse vocale Mooré: texte vers audio",
    description="Démo de synthèse vocale d'un texte en langage Mooré!",
)

traduction_interface = gr.Interface(
    fn=goai_traduction.goai_traduction,
    inputs=[
        gr.Textbox(label="Text", placeholder="Yaa sõama"),
        gr.Dropdown(label="Source Language", choices=["eng_Latn", "fra_Latn", "mos_Latn"]),
        gr.Dropdown(label="Target Language", choices=["eng_Latn", "fra_Latn", "mos_Latn"])
    ],
    outputs=["text"],
    examples=[["Yʋʋm a wãn la b kẽesd biig lekolle?", "mos_Latn", "fra_Latn"],
              ["Zak-soab la kasma.", "mos_Latn", "fra_Latn"],
              ["Le gouvernement avait pris des mesures louables par rapport à l’augmentation des prix de certaines denrées alimentaires.", "fra_Latn", "mos_Latn"],
              ["Comme lors du match face à la Côte d’Ivoire, c’est sur un coup de pied arrêté que les Etalons encaissent leur but.", "fra_Latn", "mos_Latn"],
    ],
    title="Traduction du Mooré: texte vers texte",
    description="Démo de traduction d'un texte en langage Mooré à partir de l'anglais ou du français!",
)


with demo:
    gr.TabbedInterface(
        [traduction_interface, tts_interface, stt_interface],
        ["Traduction", "Text-2-speech", "Speech-2-text"],
    )

demo.launch()
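By default, demo.launch() serves the app on http://127.0.0.1:7860. If a temporary public URL is wanted while testing, Gradio's share flag is a minimal tweak (a sketch, not part of this commit):

demo.launch(share=True)  # generates a temporary public gradio.live link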
example1.mp3
ADDED
Binary file (11.1 kB)

example2.mp3
ADDED
Binary file (15.7 kB)

example3.mp3
ADDED
Binary file (11.6 kB)

example4.mp3
ADDED
Binary file (9.23 kB)
goai_stt.py
ADDED
@@ -0,0 +1,60 @@
import torch
import librosa
import time
from transformers import set_seed, Wav2Vec2ForCTC, AutoProcessor
import numpy as np

device = 0 if torch.cuda.is_available() else "cpu"

def goai_stt(fichier):
    """
    Transcribe a given audio file.

    Parameters
    ----------
    fichier: str | tuple[int, np.ndarray]
        Path to the audio file, or a tuple of the sampling rate and the audio data.

    Returns
    -------
    transcription: str
        The transcribed text.
    """

    print("Received input ---------> ", fichier)

    if fichier is None:
        raise ValueError("The audio file is missing.")

    ### ensure reproducibility
    set_seed(2024)

    start_time = time.time()

    ### load the transcription model (reloaded on every call; module-level caching would avoid repeated loads)
    model_id = "anyantudre/wav2vec2-large-mms-1b-mos-V1"

    processor = AutoProcessor.from_pretrained(model_id)
    model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang="mos", ignore_mismatched_sizes=True).to(device)

    if isinstance(fichier, str):
        ### preprocess the audio from a file path, resampled to 16 kHz
        signal, sampling_rate = librosa.load(fichier, sr=16000)
    else:
        ### preprocess the audio from a (sampling rate, numpy array) tuple
        sampling_rate, signal = fichier

        ### convert integer PCM (Gradio's default numpy format) to float32 in [-1, 1]
        if np.issubdtype(signal.dtype, np.integer):
            signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
        else:
            signal = signal.astype(np.float32)

        ### resample to the 16 kHz rate the model expects
        if sampling_rate != 16000:
            signal = librosa.resample(signal, orig_sr=sampling_rate, target_sr=16000)

    inputs = processor(signal, sampling_rate=16000, return_tensors="pt", padding=True).to(device)

    ### run inference
    with torch.no_grad():
        outputs = model(**inputs).logits

    pred_ids = torch.argmax(outputs, dim=-1)[0]
    transcription = processor.decode(pred_ids)

    print("Elapsed time: ", int(time.time() - start_time), " seconds")
    return transcription
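For a quick local sanity check, the transcription helper can be called directly on one of the example clips bundled in this commit (a hypothetical snippet, not part of the Space):

from goai_stt import goai_stt

print(goai_stt("./example1.mp3"))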
goai_traduction.py
ADDED
@@ -0,0 +1,21 @@
import torch
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

max_length = 512
device = 0 if torch.cuda.is_available() else "cpu"
model_id = "ArissBandoss/nllb-200-distilled-600M-finetuned-fr-to-mos-V1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)


def goai_traduction(text, src_lang, tgt_lang):
    trans_pipe = pipeline("translation",
                          model=model, tokenizer=tokenizer,
                          src_lang=src_lang, tgt_lang=tgt_lang,
                          max_length=max_length,
                          device=device)

    return trans_pipe(text)[0]["translation_text"]
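The translation helper can likewise be exercised outside Gradio; a minimal sketch using the NLLB-style language codes exposed in the app's dropdowns (hypothetical usage, not part of this commit):

from goai_traduction import goai_traduction

print(goai_traduction("Zak-soab la kasma.", src_lang="mos_Latn", tgt_lang="fra_Latn"))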
goai_tts.py
ADDED
@@ -0,0 +1,41 @@
import time
import torch
import numpy as np
from transformers import pipeline, set_seed

device = 0 if torch.cuda.is_available() else "cpu"


def goai_tts(texte):
    """
    Generate the Mooré speech corresponding to a given text.

    Parameters
    ----------
    texte: str
        The written text.

    Returns
    -------
    A tuple of the sampling rate and the audio data as a numpy array.
    """

    ### ensure reproducibility
    set_seed(2024)

    start_time = time.time()

    ### load the TTS model (reloaded on every call; module-level caching would avoid repeated loads)
    model_id = "anyantudre/mms-tts-mos-V1"
    synthesiser = pipeline("text-to-speech", model_id, device=device)

    ### inference
    speech = synthesiser(texte)

    sample_rate = speech["sampling_rate"]
    audio_data = np.array(speech["audio"][0], dtype=float)

    print("Elapsed time: ", int(time.time() - start_time), " seconds")

    return sample_rate, audio_data
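Since scipy is already listed in requirements.txt, the returned (rate, array) pair can be written straight to disk; a hypothetical snippet, not part of this commit:

import scipy.io.wavfile

from goai_tts import goai_tts

rate, audio = goai_tts("yõk foto")
scipy.io.wavfile.write("output.wav", rate, audio)  # float data is written as a float WAV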
requirements.txt
ADDED
@@ -0,0 +1,8 @@
datasets
librosa
pycountry
scipy
sentencepiece
transformers
torch
gradio