Spaces:
Runtime error
Runtime error
File size: 2,805 Bytes
3038346 eb57fd9 3038346 eb57fd9 3038346 293ecc9 3038346 293ecc9 3038346 d347764 3038346 d347764 3038346 d347764 3038346 d347764 293ecc9 3038346 d347764 3038346 d347764 293ecc9 3038346 d347764 3038346 d347764 3038346 f805e49 c737803 3038346 c737803 d347764 226ec3a d347764 c737803 3946ba6 c737803 b2742fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# -*- coding: utf-8 -*-
"""Built_Speech-to-Speech_Translation.ipynb"""
# Automatically generated by Colaboratory.
# Original file is located at
# https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
"""Speech Translation to Text Part"""
import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import pipeline

# Authenticate against the Hugging Face Hub with a token read from the
# environment (e.g. a Space secret exposed as HF_TOKEN) rather than a literal
# in the source. Any token that was ever committed to the repository stays
# readable in its history and has to be revoked.
# When HF_TOKEN is unset the login step is skipped and Hub access is anonymous.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Use the first CUDA device when torch reports one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline backed by the openai/whisper-base checkpoint.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

# Streaming handle on the "nl" configuration of VoxPopuli (validation split).
dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
def translate(audio):
    """Run the speech-recognition pipeline on *audio* and return its text.

    Generation is pinned to the "transcribe" task with language "nl" and is
    capped at 256 new tokens.

    Args:
        audio: Whatever the module-level ``pipe`` accepts as input; the Gradio
            interfaces in this file pass a file path.

    Returns:
        The value stored under the "text" key of the pipeline output.
    """
    generation_options = {"task": "transcribe", "language": "nl"}
    result = pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    return result["text"]
"""Text-to-Speech Part"""
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Text processor for SpeechT5.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# synthesize() moves its input tensors to `device`, so the acoustic model and
# the vocoder have to live on that same device; leaving them on the CPU makes
# generation fail with a device mismatch whenever CUDA is selected.
model = SpeechT5ForTextToSpeech.from_pretrained(
    "Bolakubus/speecht5_finetuned_voxpopuli_nl"
).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load the speaker embedding: entry 7306 of the x-vector dataset, with a
# leading dimension added by unsqueeze(0).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
def synthesize(text):
    """Generate speech for *text* with the SpeechT5 model and vocoder.

    The text is tokenized by the module-level ``processor``; the token ids and
    the speaker embedding are moved to ``device`` before generation.

    Args:
        text: The text to be spoken.

    Returns:
        The generated speech tensor, moved back to the CPU.
    """
    encoded = processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"].to(device)
    voice = speaker_embeddings.to(device)
    waveform = model.generate_speech(token_ids, voice, vocoder=vocoder)
    return waveform.cpu()
"""Creating Speech-to-Speech Translation (STST) Demo"""
import numpy as np
# The synthesized audio is scaled by the largest value the target dtype
# (int16) can hold and is then cast to that dtype.
target_dtype = np.int16
_target_limits = np.iinfo(target_dtype)
max_range = _target_limits.max  # 32767 for int16
def speech_to_speech_translation(audio):
    """Turn input speech into synthesized speech of the recognized text.

    The audio is passed through ``translate`` to obtain text, the text through
    ``synthesize`` to obtain a waveform, and the waveform is scaled by
    ``max_range`` and cast to int16.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the int16 NumPy
        array; the Gradio audio output in this file consumes it as
        ``type="numpy"``.
    """
    text = translate(audio)
    waveform = synthesize(text).numpy()
    pcm_samples = (waveform * max_range).astype(np.int16)
    return 16000, pcm_samples
import gradio as gr

demo = gr.Blocks()
description = "Speech-to-Speech Translation En->Nl"
title = "Building Demo for Audio Course"


def _audio_input(origin):
    """Return a ``gr.Audio`` input of type "filepath" fed from *origin*.

    Gradio 4 replaced the ``source`` keyword of ``gr.Audio`` with ``sources``
    (a list). The legacy spelling is tried first and the new one is used only
    when the constructor rejects it with ``TypeError``, so the script starts
    under either API.

    Args:
        origin: "microphone" or "upload".
    """
    try:
        return gr.Audio(source=origin, type="filepath")
    except TypeError:
        return gr.Audio(sources=[origin], type="filepath")


# One interface per input method; both run the same pipeline and show the
# title and description defined above.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=_audio_input("microphone"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=_audio_input("upload"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Expose the two interfaces as tabs of a single Blocks app.
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(share=False, debug=False)