Spaces:
Runtime error
Runtime error
Upload app.py
Browse files
app.py
CHANGED
@@ -6,20 +6,10 @@ Automatically generated by Colaboratory.
|
|
6 |
Original file is located at
|
7 |
https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
|
8 |
"""
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
! pip install torch
|
13 |
-
|
14 |
-
! pip install --upgrade accelerate
|
15 |
-
|
16 |
-
! pip install datasets soundfile speechbrain
|
17 |
-
|
18 |
-
"""### Speech Translation to Text"""
|
19 |
-
|
20 |
-
from huggingface_hub import notebook_login
|
21 |
-
|
22 |
-
notebook_login()
|
23 |
|
24 |
import torch
|
25 |
from transformers import pipeline
|
@@ -30,51 +20,27 @@ pipe = pipeline(
|
|
30 |
)
|
31 |
|
32 |
from datasets import load_dataset
|
33 |
-
|
34 |
dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
|
35 |
sample = next(iter(dataset))
|
36 |
|
37 |
from IPython.display import Audio
|
38 |
-
|
39 |
Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
|
40 |
|
41 |
-
# Speech-translation helper. Note that the call below passes task="transcribe" with language="nl", not task="translate".
|
42 |
-
# Recall that the "transcribe" task is the one used for speech recognition.
|
43 |
def translate(audio):
    """Run the module-level ``pipe`` on ``audio`` and return its text output.

    The pipeline is called with ``max_new_tokens=256`` and generation
    options selecting task ``"transcribe"`` and language ``"nl"``.

    NOTE(review): despite the function name, the task passed is
    "transcribe", not "translate" -- confirm this is intended.

    Args:
        audio: Input forwarded unchanged to ``pipe``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    decoding_options = {"task": "transcribe", "language": "nl"}
    result = pipe(audio, max_new_tokens=256, generate_kwargs=decoding_options)
    return result["text"]
|
46 |
|
47 |
-
"""
|
48 |
-
|
49 |
-
generate_kwargs={"task": "transcribe", "language": "es"}
|
50 |
-
"""
|
51 |
-
|
52 |
-
# See the translation result
|
53 |
-
translate(sample["audio"].copy())
|
54 |
-
|
55 |
-
# Compare to raw text
|
56 |
-
sample["raw_text"]
|
57 |
-
|
58 |
-
"""### Text-to-Speech"""
|
59 |
|
60 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
61 |
-
|
62 |
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
63 |
model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
|
64 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
65 |
|
66 |
-
"""Here we're using a SpeechT5 checkpoint fine-tuned specifically for Dutch TTS: Bolakubus/speecht5_finetuned_voxpopuli_nl. Should you wish to translate into a language other than Dutch, either swap the checkpoint for a SpeechT5 TTS model fine-tuned on your language of choice, or use an MMS TTS checkpoint pre-trained in your target language."""
|
67 |
-
|
68 |
-
# Put the model and vocoder to GPU accelerator device if we have one
|
69 |
-
model.to(device)
|
70 |
-
vocoder.to(device)
|
71 |
-
|
72 |
# Load Speakers Embedding
|
73 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
74 |
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
75 |
|
76 |
-
"""We can now write a function that takes a text prompt as input, and generates the corresponding speech. We’ll first pre-process the text input using the SpeechT5 processor, tokenizing the text to get our input ids. We’ll then pass the input ids and speaker embeddings to the SpeechT5 model, placing each on the accelerator device if available. Finally, we’ll return the generated speech, bringing it back to the CPU so that we can play it back in our ipynb notebook:"""
|
77 |
-
|
78 |
def synthesize(text):
|
79 |
inputs = processor(text=text, return_tensors="pt")
|
80 |
speech = model.generate_speech(
|
@@ -82,13 +48,7 @@ def synthesize(text):
|
|
82 |
)
|
83 |
return speech.cpu()
|
84 |
|
85 |
-
|
86 |
-
speech = synthesize("This is a test")
|
87 |
-
|
88 |
-
Audio(speech, rate=16000)
|
89 |
-
|
90 |
-
"""### Creating Speech-to-Speech Translation (STST) Demo"""
|
91 |
-
|
92 |
import numpy as np
|
93 |
|
94 |
# Normalize the audio array by the dynamic range of the target dtype (int16)
|
@@ -106,8 +66,6 @@ sampling_rate, synthesized_speech = speech_to_speech_translation(sample["audio"]
|
|
106 |
|
107 |
Audio(synthesized_speech, rate=sampling_rate)
|
108 |
|
109 |
-
! pip install gradio
|
110 |
-
|
111 |
import gradio as gr
|
112 |
from gradio.mix import Series
|
113 |
|
|
|
6 |
Original file is located at
|
7 |
https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
|
8 |
"""
|
9 |
+
"""Speech Translation to Text Part"""
|
10 |
|
11 |
+
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# The access token must never be hard-coded in source: this file lives in a
# public repository, so any literal token is readable by everyone. Read it
# from the environment instead (on Spaces, define it as a repository secret).
# NOTE(review): the token previously committed in this file should be treated
# as compromised and revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # Fail early with an actionable message instead of an interactive prompt.
    raise RuntimeError(
        "Set the HF_TOKEN environment variable (e.g. as a Space secret) "
        "before starting the app."
    )
login(token=hf_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
import torch
|
15 |
from transformers import pipeline
|
|
|
20 |
)
|
21 |
|
22 |
from datasets import load_dataset
|
|
|
23 |
dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
|
24 |
sample = next(iter(dataset))
|
25 |
|
26 |
from IPython.display import Audio
|
|
|
27 |
Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
|
28 |
|
|
|
|
|
29 |
def translate(audio):
    """Run the module-level ``pipe`` on ``audio`` and return its text output.

    The pipeline is called with ``max_new_tokens=256`` and generation
    options selecting task ``"transcribe"`` and language ``"nl"``.

    NOTE(review): despite the function name, the task passed is
    "transcribe", not "translate" -- confirm this is intended.

    Args:
        audio: Input forwarded unchanged to ``pipe``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    decoding_options = {"task": "transcribe", "language": "nl"}
    result = pipe(audio, max_new_tokens=256, generate_kwargs=decoding_options)
    return result["text"]
|
32 |
|
33 |
+
"""Text-to-Speech Part"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
|
|
36 |
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
37 |
model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
|
38 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
# Load Speakers Embedding
|
41 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
42 |
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
43 |
|
|
|
|
|
44 |
def synthesize(text):
|
45 |
inputs = processor(text=text, return_tensors="pt")
|
46 |
speech = model.generate_speech(
|
|
|
48 |
)
|
49 |
return speech.cpu()
|
50 |
|
51 |
+
"""Creating Speech-to-Speech Translation (STST) Demo"""
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
import numpy as np
|
53 |
|
54 |
# Normalize the audio array by the dynamic range of the target dtype (int16)
|
|
|
66 |
|
67 |
Audio(synthesized_speech, rate=sampling_rate)
|
68 |
|
|
|
|
|
69 |
import gradio as gr
|
70 |
from gradio.mix import Series
|
71 |
|