import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, AutoTokenizer
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

device = "cuda:0" if torch.cuda.is_available() else "cpu"
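
# Cascaded pipeline: Whisper transcribes the source speech, a language
# detector selects the matching Opus-MT model to translate the text into
# Spanish, and SpeechT5 + HiFi-GAN synthesise the Spanish audio.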


translation_models = {
    "en": "Helsinki-NLP/opus-mt-en-es",  # Inglés a Español
    "fr": "Helsinki-NLP/opus-mt-fr-es",  # Francés a Español
    "de": "Helsinki-NLP/opus-mt-de-es",  # Alemán a Español
    "it": "Helsinki-NLP/opus-mt-it-es",  # Italiano a Español
    "pt": "Helsinki-NLP/opus-mt-pt-es",  # Portugués a Español
    "nl": "Helsinki-NLP/opus-mt-nl-es",  # Neerlandés (Holandés) a Español
    "fi": "Helsinki-NLP/opus-mt-fi-es",  # Finés a Español
    "sv": "Helsinki-NLP/opus-mt-sv-es",  # Sueco a Español
    "da": "Helsinki-NLP/opus-mt-da-es",  # Danés a Español
    "no": "Helsinki-NLP/opus-mt-no-es",  # Noruego a Español
    "ru": "Helsinki-NLP/opus-mt-ru-es",  # Ruso a Español
    "pl": "Helsinki-NLP/opus-mt-pl-es",  # Polaco a Español
    "cs": "Helsinki-NLP/opus-mt-cs-es",  # Checo a Español
    "tr": "Helsinki-NLP/opus-mt-tr-es",  # Turco a Español
    "zh": "Helsinki-NLP/opus-mt-zh-es",  # Chino a Español
    "ja": "Helsinki-NLP/opus-mt-ja-es",  # Japonés a Español
    "ar": "Helsinki-NLP/opus-mt-ar-es",  # Árabe a Español
    "ro": "Helsinki-NLP/opus-mt-ro-es",  # Rumano a Español
    "el": "Helsinki-NLP/opus-mt-el-es",  # Griego a Español
    "bg": "Helsinki-NLP/opus-mt-bg-es",  # Búlgaro a Español
    "uk": "Helsinki-NLP/opus-mt-uk-es",  # Ucraniano a Español
    "he": "Helsinki-NLP/opus-mt-he-es",  # Hebreo a Español
    "lt": "Helsinki-NLP/opus-mt-lt-es",  # Lituano a Español
    "et": "Helsinki-NLP/opus-mt-et-es",  # Estonio a Español
    "hr": "Helsinki-NLP/opus-mt-hr-es",  # Croata a Español
    "hu": "Helsinki-NLP/opus-mt-hu-es",  # Húngaro a Español
    "lv": "Helsinki-NLP/opus-mt-lv-es",  # Letón a Español
    "sl": "Helsinki-NLP/opus-mt-sl-es",  # Esloveno a Español
    "sk": "Helsinki-NLP/opus-mt-sk-es",  # Eslovaco a Español
    "sr": "Helsinki-NLP/opus-mt-sr-es",  # Serbio a Español
    "fa": "Helsinki-NLP/opus-mt-fa-es",  # Persa a Español
}
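# The checkpoints above follow the Opus-MT naming scheme opus-mt-<src>-<tgt>;
# the keys are ISO 639-1 codes, the same labels the language detector emits.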

asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
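# The ASR pipeline returns {"text": "..."}; Whisper's default task is
# transcription, so the text stays in the source language for detection.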

# Alternative TTS backend (currently unused): Meta's MMS Spanish VITS model.
# vits_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
# vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")


model = SpeechT5ForTextToSpeech.from_pretrained(
    "juangtzi/speecht5_finetuned_voxpopuli_es"
)
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Precomputed speaker embedding (x-vector) for SpeechT5, typically of shape (1, 512).
speaker_embeddings2 = torch.tensor(np.load('speaker_embeddings.npy'))
lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")

def language_detector(text):
    # papluca/xlm-roberta-base-language-detection returns ISO 639-1 labels
    # (e.g. "en"), which match the keys of translation_models above.
    resultado = lang_detector(text)
    idioma_detectado = resultado[0]['label']
    print(idioma_detectado)
    return idioma_detectado

def translate(audio):
    transcribe = asr_pipe(audio, max_new_tokens=256)

    codigo_idioma = language_detector(transcribe['text'])

    if codigo_idioma in translation_models:
        translator = pipeline("translation", model=translation_models[codigo_idioma])
        traduccion = translator(transcribe['text'])
    else:
        # No translation model for this language: fall back to the raw
        # transcription so the TTS stage still receives text.
        print(f"No translation model available for the detected language: {codigo_idioma}")
        return transcribe['text']
    
    return traduccion
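
# For reference, a Hugging Face translation pipeline returns a list of dicts,
# e.g. translator("Hello") -> [{'translation_text': 'Hola'}], which is why
# synthesise() below unwraps text[0]['translation_text'].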

def synthesise(text):
    # translate() returns a list of {"translation_text": ...} dicts on success,
    # or a plain string on the fallback path; unwrap to a string either way.
    if isinstance(text, list):
        text = text[0]['translation_text']
    print(text)
    inputs = processor(text=text, return_tensors="pt")
    output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
    return output

def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    audio_data = np.squeeze(synthesised_speech.cpu().numpy())
    # Peak-normalise to [-1, 1]; guard against an all-zero waveform.
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data = audio_data / peak
    sample_rate = 16000  # SpeechT5 + HiFi-GAN generate 16 kHz audio
    return (sample_rate, audio_data)
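
# Gradio Audio outputs with type="numpy" expect exactly this
# (sample_rate, np.ndarray) tuple.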

title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping source speech in any of the supported source languages to target speech in Spanish.

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()
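
# A minimal offline smoke test (a sketch; assumes ./example.wav exists and
# that the soundfile package is installed, neither of which this script
# guarantees):
#
#     sr, wav = speech_to_speech_translation("./example.wav")
#     import soundfile as sf
#     sf.write("translated.wav", wav, sr)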