speech-to-speech-translation_en-nl

Runtime error

App Files Files Community

Bolakubus committed on Oct 6, 2023

Commit

293ecc9

•

1 Parent(s): 3038346

Upload app.py

Browse files

Files changed (1) hide show

app.py +5 -47

app.py CHANGED Viewed

@@ -6,20 +6,10 @@ Automatically generated by Colaboratory.
 Original file is located at
     https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
 """
-! pip install git+https://github.com/huggingface/transformers.git
-! pip install torch
-! pip install --upgrade accelerate
-! pip install datasets soundfile speechbrain
-"""### Speech Translation to Text"""
-from huggingface_hub import notebook_login
-notebook_login()
 import torch
 from transformers import pipeline
@@ -30,51 +20,27 @@ pipe = pipeline(
 )
 from datasets import load_dataset
 dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
 sample = next(iter(dataset))
 from IPython.display import Audio
 Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
-# Function to generate task argument "translate" for speech translation
-# Recall that "transcribe" task for Speech Recognition
 def translate(audio):
     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
     return outputs["text"]
-"""Whisper can also be ‘tricked’ into translating from speech in any language X to any language Y. Simply set the task to "transcribe" and the "language" to your target language in the generation key-word arguments, e.g. for Spanish, one would set:
-generate_kwargs={"task": "transcribe", "language": "es"}
-"""
-# See the translation result
-translate(sample["audio"].copy())
-# Compare to raw text
-sample["raw_text"]
-"""### Text-to-Speech"""
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-"""Here we're using SpeechT5 checkpoint trained specifically for Dutch TTS from Bolakubus/speecht5_finetuned_voxpopuli_nl . Should you wish to translate into a language other than Dutch, either swap the checkpoint for a SpeechT5 TTS model fine-tuned on your language of choice, or use an MMS TTS checkpoint pre-trained in your target language."""
-# Put the model and vocoder to GPU accelerator device if we have one
-model.to(device)
-vocoder.to(device)
 # Load Speakers Embedding
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-"""We can now write a function that takes a text prompt as input, and generates the corresponding speech. We’ll first pre-process the text input using the SpeechT5 processor, tokenizing the text to get our input ids. We’ll then pass the input ids and speaker embeddings to the SpeechT5 model, placing each on the accelerator device if available. Finally, we’ll return the generated speech, bringing it back to the CPU so that we can play it back in our ipynb notebook:"""
 def synthesize(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(
@@ -82,13 +48,7 @@ def synthesize(text):
     )
     return speech.cpu()
-# Dummy Check
-speech = synthesize("This is a test")
-Audio(speech, rate=16000)
-"""### Creating Speech-to-Speech Translation (STST) Demo"""
 import numpy as np
 # Normalized Audio array by the dynamic range of the target dtype (int16)
@@ -106,8 +66,6 @@ sampling_rate, synthesized_speech = speech_to_speech_translation(sample["audio"]
 Audio(synthesized_speech, rate=sampling_rate)
-! pip install gradio
 import gradio as gr
 from gradio.mix import Series

 Original file is located at
     https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
 """
+"""Speech Translation to Text Part"""
+from huggingface_hub import login
+login("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")
 import torch
 from transformers import pipeline
 )
 from datasets import load_dataset
 dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
 sample = next(iter(dataset))
 from IPython.display import Audio
 Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
 def translate(audio):
     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
     return outputs["text"]
+"""Text-to-Speech Part"""
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 # Load Speakers Embedding
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 def synthesize(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(
     )
     return speech.cpu()
+"""Creating Speech-to-Speech Translation (STST) Demo"""
 import numpy as np
 # Normalized Audio array by the dynamic range of the target dtype (int16)
 Audio(synthesized_speech, rate=sampling_rate)
 import gradio as gr
 from gradio.mix import Series