# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/143eWt9oxUTcF59OBiVybOgKXJB3QOTsK
"""
# Beginning of Unit 7
import torch, torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import sentencepiece  # not used directly, but MarianTokenizer needs it installed
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from IPython.display import Audio
import numpy as np
# Constants for converting a float waveform in [-1, 1] to 16-bit PCM
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max  # 32767
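# Example (hedged sketch, not part of the original app): how these constants
# map a float waveform onto the int16 range.
# pcm = (np.array([-1.0, 0.0, 1.0]) * max_range).astype(target_dtype)  # -> [-32767, 0, 32767]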
# Step 1: ASR — transcribe the Spanish source speech to Spanish text with Whisper
def transcribe(audio):
    model_id_asr = "openai/whisper-small"
    processor_asr = WhisperProcessor.from_pretrained(model_id_asr)
    model_asr = WhisperForConditionalGeneration.from_pretrained(model_id_asr)
    # Clear any forced decoder prompts so Whisper auto-detects language/task
    model_asr.config.forced_decoder_ids = None
    input_features = processor_asr(
        audio["audio"]["array"],
        sampling_rate=audio["audio"]["sampling_rate"],
        return_tensors="pt",
    ).input_features
    predicted_ids = model_asr.generate(input_features)
    # Decode token ids to text
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
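# Example (hedged): exercise `transcribe` on one Spanish sample. The dataset
# name below is an illustrative assumption, not something this app ships with.
# sample = next(iter(load_dataset("facebook/voxpopuli", "es", split="validation", streaming=True)))
# print(transcribe({"audio": sample["audio"]}))  # sample["audio"] has "array" and "sampling_rate"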
# Step 2: MT — translate the Spanish transcription into French with MarianMT
def translate(text):
    model_id_mt = "Helsinki-NLP/opus-mt-es-fr"
    tokenizer_mt = MarianTokenizer.from_pretrained(model_id_mt)
    model_mt = MarianMTModel.from_pretrained(model_id_mt)
    # Tokenize the input text
    input_ids = tokenizer_mt.encode(text, return_tensors="pt")
    # Generate the translation without tracking gradients
    with torch.no_grad():
        translated_ids = model_mt.generate(input_ids)
    # Decode the translated token ids back to text
    translated_text = tokenizer_mt.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text
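# Example (hedged): a quick sanity check of the es->fr step; the expected
# output is an assumption about this checkpoint, not a guaranteed string.
# print(translate("Hola, ¿cómo estás?"))  # something like "Bonjour, comment ça va ?"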
# Step 3: TTS — synthesise French speech with SpeechT5 fine-tuned on the
# French split of VoxPopuli, plus the HiFi-GAN vocoder
def synthesise(text):
    processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
    model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    # A fixed speaker identity: one x-vector picked from the CMU ARCTIC set
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    inputs = processor_tts(text=text, return_tensors="pt")
    speech = model_tts.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    return speech
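# Example (hedged): listen to a synthesised clip in a notebook. SpeechT5
# generates 16 kHz audio; `Audio` comes from the IPython import above.
# speech = synthesise("Bonjour tout le monde !")
# Audio(speech.numpy(), rate=16000)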
# NOTE: this dict-based variant is immediately shadowed by the filepath-based
# definition below, which is the one the Gradio interfaces actually call.
def speech_to_speech_translation(audio):
    transcribed_text = transcribe(audio)
    translated_text = translate(transcribed_text)
    synthesised_speech = synthesise(translated_text)
    return 16000, synthesised_speech
def speech_to_speech_translation(audio_filepath):
    # Load the audio file from disk
    waveform, sampling_rate = torchaudio.load(audio_filepath)
    # Resample to the 16 kHz rate Whisper expects
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        waveform = resampler(waveform)
        sampling_rate = 16000
    # Collapse multi-channel audio to mono and build the dict `transcribe` expects
    audio_dict = {
        "audio": {
            "array": waveform.mean(dim=0).numpy(),
            "sampling_rate": sampling_rate,
        }
    }
    transcribed_text = transcribe(audio_dict)
    translated_text = translate(transcribed_text)
    synthesised_speech = synthesise(translated_text)
    # Scale the float waveform in [-1, 1] to 16-bit PCM for Gradio's numpy output
    synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
    return 16000, synthesised_speech
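# Example (hedged): run the full cascade on a local file. The path is a
# placeholder for illustration, not a file shipped with this Space.
# rate, pcm = speech_to_speech_translation("example_es.wav")
# print(rate, pcm.dtype, pcm.shape)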
import gradio as gr

demo = gr.Blocks()

# The `source=` keyword below follows the Gradio 3.x Audio API; Gradio 4.x
# renamed it to `sources=[...]`.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(debug=True, share=False)