import numpy as np
import torch
import gradio as gr
from datasets import load_dataset
from transformers import (
    pipeline,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Set up the automatic speech recognition pipeline (Whisper also performs
# speech translation when given the "translate" task)
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")


# Translate the input audio to English text
def translate(audio):
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    return outputs["text"]


# Load the SpeechT5 processor, the fine-tuned TTS model, and the HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("gitgato/mabama")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the speaker embedding (x-vector) that conditions the synthesized voice
embeddings_dataset = load_dataset("ovieyra21/mabama-v5", split="train")
speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)


# Synthesize speech from text
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    return speech.numpy()


# Output audio settings: scale the float waveform to 16-bit PCM
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


# Speech-to-speech translation: ASR + translation, then TTS
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech * max_range).astype(target_dtype)
    return 16000, synthesised_speech


# Gradio interface
demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Input Audio"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title="Speech-to-Speech Translation",
    description="Translate speech input to synthesized speech output.",
)

# Launch the interface
demo.launch(debug=True)
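
# A minimal local smoke test (a sketch: "sample.wav" is a hypothetical 16 kHz
# mono recording, not a file shipped with this script). Uncomment to exercise
# the full ASR -> translation -> TTS chain once, without the Gradio UI:
#
# rate, waveform = speech_to_speech_translation("sample.wav")
# print(f"Generated {waveform.shape[0] / rate:.2f}s of audio at {rate} Hz")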