Bolakubus committed on
Commit
293ecc9
1 Parent(s): 3038346

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -47
app.py CHANGED
@@ -6,20 +6,10 @@ Automatically generated by Colaboratory.
6
  Original file is located at
7
  https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
8
  """
 
9
 
10
- ! pip install git+https://github.com/huggingface/transformers.git
11
-
12
- ! pip install torch
13
-
14
- ! pip install --upgrade accelerate
15
-
16
- ! pip install datasets soundfile speechbrain
17
-
18
- """### Speech Translation to Text"""
19
-
20
- from huggingface_hub import notebook_login
21
-
22
- notebook_login()
23
 
24
  import torch
25
  from transformers import pipeline
@@ -30,51 +20,27 @@ pipe = pipeline(
30
  )
31
 
32
  from datasets import load_dataset
33
-
34
  dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
35
  sample = next(iter(dataset))
36
 
37
  from IPython.display import Audio
38
-
39
  Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
40
 
41
- # Function to generate task argument "translate" for speech translation
42
- # Recall that "transcribe" task for Speech Recognition
43
  def translate(audio):
44
  outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
45
  return outputs["text"]
46
 
47
- """Whisper can also be ‘tricked’ into translating from speech in any language X to any language Y. Simply set the task to "transcribe" and the "language" to your target language in the generation key-word arguments, e.g. for Spanish, one would set:
48
-
49
- generate_kwargs={"task": "transcribe", "language": "es"}
50
- """
51
-
52
- # See the translation result
53
- translate(sample["audio"].copy())
54
-
55
- # Compare to raw text
56
- sample["raw_text"]
57
-
58
- """### Text-to-Speech"""
59
 
60
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
61
-
62
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
63
  model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
64
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
65
 
66
- """Here we're using SpeechT5 checkpoint trained specifically for Dutch TTS from Bolakubus/speecht5_finetuned_voxpopuli_nl . Should you wish to translate into a language other than Dutch, either swap the checkpoint for a SpeechT5 TTS model fine-tuned on your language of choice, or use an MMS TTS checkpoint pre-trained in your target language."""
67
-
68
- # Put the model and vocoder to GPU accelerator device if we have one
69
- model.to(device)
70
- vocoder.to(device)
71
-
72
  # Load Speakers Embedding
73
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
74
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
75
 
76
- """We can now write a function that takes a text prompt as input, and generates the corresponding speech. We’ll first pre-process the text input using the SpeechT5 processor, tokenizing the text to get our input ids. We’ll then pass the input ids and speaker embeddings to the SpeechT5 model, placing each on the accelerator device if available. Finally, we’ll return the generated speech, bringing it back to the CPU so that we can play it back in our ipynb notebook:"""
77
-
78
  def synthesize(text):
79
  inputs = processor(text=text, return_tensors="pt")
80
  speech = model.generate_speech(
@@ -82,13 +48,7 @@ def synthesize(text):
82
  )
83
  return speech.cpu()
84
 
85
- # Dummy Check
86
- speech = synthesize("This is a test")
87
-
88
- Audio(speech, rate=16000)
89
-
90
- """### Creating Speech-to-Speech Translation (STST) Demo"""
91
-
92
  import numpy as np
93
 
94
  # Normalized Audio array by the dynamic range of the target dtype (int16)
@@ -106,8 +66,6 @@ sampling_rate, synthesized_speech = speech_to_speech_translation(sample["audio"]
106
 
107
  Audio(synthesized_speech, rate=sampling_rate)
108
 
109
- ! pip install gradio
110
-
111
  import gradio as gr
112
  from gradio.mix import Series
113
 
 
6
  Original file is located at
7
  https://colab.research.google.com/drive/1AHToRlVpGAy3jQdbTm14tDdTyRPc-oG3
8
  """
9
+ """Speech Translation to Text Part"""
10
 
11
+ import os
+ from huggingface_hub import login
12
+ # Authenticate with a token read from the environment (e.g. a Space secret named HF_TOKEN).
+ # Never hard-code access tokens in source: anything committed here is publicly readable.
+ login(token=os.environ["HF_TOKEN"])
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  import torch
15
  from transformers import pipeline
 
20
  )
21
 
22
  from datasets import load_dataset
 
23
  dataset = load_dataset("facebook/voxpopuli", "nl", split="validation", streaming=True)
24
  sample = next(iter(dataset))
25
 
26
  from IPython.display import Audio
 
27
  Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
28
 
 
 
29
  def translate(audio):
  # Runs the module-level `pipe` on `audio`, capping generation at 256 new tokens,
  # and returns the "text" entry of the pipeline output.
  # NOTE(review): despite the function name, the task passed is "transcribe" with
  # language "nl"; presumably this relies on Whisper transcribing directly into the
  # target language rather than using the "translate" task — confirm this is intended.
30
  outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
31
  return outputs["text"]
32
 
33
+ """Text-to-Speech Part"""
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
36
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
37
  model = SpeechT5ForTextToSpeech.from_pretrained("Bolakubus/speecht5_finetuned_voxpopuli_nl")
38
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
39
 
 
 
 
 
 
 
40
  # Load Speakers Embedding
41
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
42
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
43
 
 
 
44
  def synthesize(text):
45
  inputs = processor(text=text, return_tensors="pt")
46
  speech = model.generate_speech(
 
48
  )
49
  return speech.cpu()
50
 
51
+ """Creating Speech-to-Speech Translation (STST) Demo"""
 
 
 
 
 
 
52
  import numpy as np
53
 
54
  # Normalized Audio array by the dynamic range of the target dtype (int16)
 
66
 
67
  Audio(synthesized_speech, rate=sampling_rate)
68
 
 
 
69
  import gradio as gr
70
  from gradio.mix import Series
71