# -*- coding: utf-8 -*-
"""Built_Speech-to-Speech_Translation.ipynb"""

# Automatically generated by Colaboratory.

# Original file is located at
#     https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3

"""Speech Translation to Text Part"""

from huggingface_hub import login
login()  # prompts for your Hugging Face access token; never hardcode tokens in notebooks

import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

from datasets import load_dataset

# Stream the Dutch VoxPopuli validation split to use as test audio for the pipeline
dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)

def translate(audio):
    # Whisper's "translate" task only targets English, so to reach Dutch we set
    # task="transcribe" with language="nl": this nudges the model to emit Dutch
    # text even when the source speech is in another language
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
    return outputs["text"]
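
# Quick sanity check of the translation stage on one streamed sample (a sketch;
# assumes the standard VoxPopuli "audio" field layout, and uses .copy() because
# the ASR pipeline mutates the dict it receives):
sample = next(iter(dataset))
print(translate(sample["audio"].copy()))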

"""Text-to-Speech Part"""

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
model.to(device)  # move the TTS model to the same device as its inputs
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder.to(device)  # generate_speech runs the vocoder on the same device

# Load a speaker embedding (x-vector) to condition the TTS voice; index 7306
# follows the standard SpeechT5 examples
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def synthesize(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()  # mono 16 kHz waveform as a float32 tensor
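
# Listen to a single synthesized utterance to verify the fine-tuned voice
# (the Dutch sentence below is a hypothetical example):
test_speech = synthesize("Hallo allemaal, dit is een test.")

from IPython.display import Audio
Audio(test_speech.numpy(), rate=16000)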

"""Creating Speech-to-Speech Translation (STST) Demo"""
import numpy as np

# Gradio's numpy audio output expects int16 samples, so scale the float32
# waveform (values in [-1, 1]) by the int16 dynamic range before casting
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesized_speech = synthesize(translated_text)
    synthesized_speech = (synthesized_speech.numpy() * max_range).astype(np.int16)
    return 16000, synthesized_speech  # SpeechT5 generates speech at 16 kHz
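
# End-to-end check of the full cascade before wiring up the UI (reuses the
# streamed `sample` from the translation sanity check above):
rate, audio_int16 = speech_to_speech_translation(sample["audio"].copy())
print(rate, audio_int16.dtype, audio_int16.shape)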

import gradio as gr

demo = gr.Blocks()
description = "Speech-to-Speech Translation En->Nl"
title = "Building Demo for Audio Course"

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    # Gradio 3.x syntax; on Gradio 4.x this becomes sources=["microphone"]
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    # Gradio 3.x syntax; on Gradio 4.x this becomes sources=["upload"]
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    description=description,
)

with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate], ["Microphone", "Audio File"], title=title
    )

demo.launch(share=False, debug=False)