import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, GenerationConfig, VitsModel, WhisperTokenizer, pipeline
# from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

device = "cuda:0" if torch.cuda.is_available() else "cpu"
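# Whisper runs on GPU when one is available; the VITS model below is left on
# CPU, which is usually fine for short utterances.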


# Load the Whisper tokenizer and generation config so we can inspect the forced
# decoder ids (the task/language tokens that steer generation).
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium")
generation_config = GenerationConfig.from_pretrained("openai/whisper-medium")

print(generation_config.forced_decoder_ids)
print(tokenizer.decode(generation_config.forced_decoder_ids[1][1]))


# Whisper-medium ASR pipeline; translate() below forces its output language to Spanish.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)


# ---------------- Speech generator mms-tts-spa --------------------------#

# MMS-TTS Spanish model (VITS architecture).
vits_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
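# Unlike the SpeechT5 alternative kept commented out below, VITS is end-to-end
# and does not need a separate vocoder such as HiFi-GAN.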

# ---------------- Speech generator speecht5_tts --------------------------#

# model = SpeechT5ForTextToSpeech.from_pretrained(
#     "juangtzi/speecht5_finetuned_voxpopuli_es"
# )
# checkpoint = "microsoft/speecht5_tts"
# processor = SpeechT5Processor.from_pretrained(checkpoint)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# speaker_embeddings2 = np.load('speaker_embeddings.npy')
# speaker_embeddings2 = torch.tensor(speaker_embeddings2)
# print(speaker_embeddings2)


# def language_detector(text):
#     resultado = lang_detector(text)
#     idioma_detectado = resultado[0]['label']
#     print(idioma_detectado)
#     return idioma_detectado

def translate(audio):
    # Whisper quirk: pairing task="transcribe" with language="es" forces Spanish
    # output even when the source speech is in another language, which is what
    # turns this step into "translate anything to Spanish".
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
    print(outputs["text"])
    return outputs["text"]
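# Example (hypothetical clip): translate("clip_en.wav") should return the
# Spanish rendering of the English speech in clip_en.wav.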

# def synthesise(text):
#     inputs = processor(text=text, return_tensors="pt")
#     output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
#     return output

# def speech_to_speech_translation(audio):
#     translated_text = translate(audio)
#     synthesised_speech = synthesise(translated_text)
#     audio_data = synthesised_speech.cpu().numpy()
#     #audio_data = np.squeeze(audio_data)
#     #audio_data = audio_data / np.max(np.abs(audio_data))
#     sample_rate = 16000
#     return (sample_rate, audio_data)

def synthesise(text):
    # Log the intermediate Spanish text, then run VITS to get a waveform tensor.
    print(text)
    inputs = vits_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = vits_model(**inputs).waveform[0]
    return output

def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Scale the float waveform in [-1, 1] to 16-bit PCM, which Gradio's numpy
    # audio output expects.
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    # facebook/mms-tts-spa generates at 16 kHz (vits_model.config.sampling_rate).
    return 16000, synthesised_speech
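# Quick smoke test without the UI (hypothetical path):
#   sr, waveform = speech_to_speech_translation("example.wav")
#   print(sr, waveform.shape)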

title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish.
"""

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)


# Define the layout inside gr.Blocks()
with demo:
    # Show the title and description
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    
    # Include the pipeline diagram
    gr.Image("https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png")
    
    # Tabbed interface for the two input modes
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])


demo.launch()