# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/143eWt9oxUTcF59OBiVybOgKXJB3QOTsK
"""

# Beginning of Unit 7

import torch, torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import sentencepiece  # MarianTokenizer depends on sentencepiece; importing it surfaces a missing install early
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from IPython.display import Audio
import numpy as np

# Gradio's numpy audio output expects 16-bit PCM, so the float waveform from the
# vocoder is rescaled into the full int16 range before being returned.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

# ASR step: transcribe the input (Spanish) speech to text with Whisper.
# Each helper below reloads its checkpoint on every call; that keeps the demo simple,
# at the cost of slower repeated requests.

def transcribe(audio):
    model_id_asr = "openai/whisper-small"
    processor_asr = WhisperProcessor.from_pretrained(model_id_asr)
    model_asr = WhisperForConditionalGeneration.from_pretrained(model_id_asr)
    # Clear any forced decoder prompt so Whisper auto-detects the spoken language
    model_asr.config.forced_decoder_ids = None

    input_features = processor_asr(audio["audio"]["array"], sampling_rate=audio["audio"]["sampling_rate"], return_tensors="pt").input_features

    predicted_ids = model_asr.generate(input_features)

    # decode token ids to text
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
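
# Optional sanity check for the ASR step alone. This is a sketch, not part of the app
# (it is never called below), and it assumes the Spanish split of facebook/voxpopuli
# is reachable from this environment.
def _check_transcribe_on_sample():
    sample = next(iter(load_dataset("facebook/voxpopuli", "es", split="validation", streaming=True)))
    return transcribe(sample)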

# MT step: translate the Spanish transcription into French with MarianMT.

def translate(text):
    model_id_mt = "Helsinki-NLP/opus-mt-es-fr"
    tokenizer_mt = MarianTokenizer.from_pretrained(model_id_mt)
    model_mt = MarianMTModel.from_pretrained(model_id_mt)
    # Tokenize the input text
    input_ids = tokenizer_mt.encode(text, return_tensors="pt")

    # Generate translation
    with torch.no_grad():
        translated_ids = model_mt.generate(input_ids)

    # Decode the translated text
    translated_text = tokenizer_mt.decode(translated_ids[0], skip_special_tokens=True)

    return translated_text
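
# Minimal usage sketch for the MT step (never called by the app): a short Spanish
# sentence should come back in French, roughly "Bonjour, comment allez-vous ?",
# depending on the checkpoint's exact output.
def _check_translate_example():
    return translate("Hola, ¿cómo estás?")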


def synthesise(text):

    processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")

    model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    inputs = processor_tts(text=text, return_tensors="pt")
    speech = model_tts.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    return speech
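
# The vocoder returns a float32 waveform at 16 kHz; in a notebook it can be played
# back directly with IPython.display.Audio (sketch only, never called by the app).
def _play_synthesised(text):
    speech = synthesise(text)
    return Audio(speech.numpy(), rate=16000)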


def speech_to_speech_translation(audio_filepath):
    # Load the audio file from disk (the Gradio inputs below use type="filepath")
    waveform, sampling_rate = torchaudio.load(audio_filepath)

    # Whisper and SpeechT5 both work at 16 kHz, so resample if necessary
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        waveform = resampler(waveform)
        sampling_rate = 16000

    # Whisper expects a mono 1-D waveform, so average away any extra channels
    waveform = waveform.mean(dim=0)

    # Wrap the waveform in the dictionary format expected by transcribe()
    audio_dict = {
        "audio": {
            "array": waveform.numpy(),
            "sampling_rate": sampling_rate,
        }
    }
    transcribed_text = transcribe(audio_dict)
    translated_text = translate(transcribed_text)
    synthesised_speech = synthesise(translated_text)
    # Rescale the float32 waveform into int16 PCM for the Gradio audio component
    synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
    return 16000, synthesised_speech
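
# End-to-end sketch for running the full cascade outside Gradio. "example_es.wav" is a
# placeholder path; point it at any Spanish speech recording.
def _run_pipeline_on_file(path="example_es.wav"):
    rate, pcm = speech_to_speech_translation(path)
    return Audio(pcm, rate=rate)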

import gradio as gr
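
# Note: gr.Audio(source=...) below is the Gradio 3.x API; Gradio 4+ renamed the
# argument to sources=["microphone"] / sources=["upload"], so pin gradio<4 or adjust.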

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(debug=True, share=False)