Ahmed007 committed
Commit 8ca6232 • 1 Parent(s): 7da1e09

Upload 5 files

Files changed (5)
  1. README.md +6 -6
  2. app.py +139 -0
  3. example.wav +0 -0
  4. packages.txt +1 -0
  5. requirements.txt +6 -0
README.md CHANGED
@@ -1,13 +1,13 @@
  ---
- title: Test
- emoji: 💻
- colorFrom: yellow
- colorTo: gray
+ title: Speech To Speech Translation
+ emoji: 🏆
+ colorFrom: pink
+ colorTo: indigo
  sdk: gradio
- sdk_version: 3.50.2
+ sdk_version: 3.36.1
  app_file: app.py
  pinned: false
- license: apache-2.0
+ duplicated_from: course-demos/speech-to-speech-translation
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,139 @@
+ # -*- coding: utf-8 -*-
+ """app.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/143eWt9oxUTcF59OBiVybOgKXJB3QOTsK
+ """
+
+ # Beginning of Unit 7
+
+ import torch
+ import torchaudio
+ import numpy as np
+ import gradio as gr
+ import sentencepiece  # noqa: F401 (MarianTokenizer needs this package at runtime)
+ from datasets import load_dataset
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ from transformers import MarianMTModel, MarianTokenizer
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+ # Target output format for Gradio: 16-bit PCM.
+ target_dtype = np.int16
+ max_range = np.iinfo(target_dtype).max
+
+ # Step 1: transcribe the Spanish source audio to Spanish text with Whisper.
+
+ def transcribe(audio):
+     model_id_asr = "openai/whisper-small"
+     processor_asr = WhisperProcessor.from_pretrained(model_id_asr)
+     model_asr = WhisperForConditionalGeneration.from_pretrained(model_id_asr)
+     model_asr.config.forced_decoder_ids = None
+
+     input_features = processor_asr(
+         audio["audio"]["array"],
+         sampling_rate=audio["audio"]["sampling_rate"],
+         return_tensors="pt",
+     ).input_features
+
+     predicted_ids = model_asr.generate(input_features)
+
+     # Decode token ids to text.
+     transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
+     return transcription[0]
+
+ # Step 2: translate the Spanish transcription to French with Marian MT.
+
+ def translate(text):
+     model_id_mt = "Helsinki-NLP/opus-mt-es-fr"
+     tokenizer_mt = MarianTokenizer.from_pretrained(model_id_mt)
+     model_mt = MarianMTModel.from_pretrained(model_id_mt)
+
+     # Tokenize the input text.
+     input_ids = tokenizer_mt.encode(text, return_tensors="pt")
+
+     # Generate the translation.
+     with torch.no_grad():
+         translated_ids = model_mt.generate(input_ids)
+
+     # Decode the translated text.
+     translated_text = tokenizer_mt.decode(translated_ids[0], skip_special_tokens=True)
+     return translated_text
+
+ # Step 3: synthesise French speech from the translated text with SpeechT5.
+
+ def synthesise(text):
+     processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+     model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+     # Condition the voice on a fixed speaker embedding (x-vector) from the
+     # CMU ARCTIC dataset.
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+     # Alternatively, a custom embedding could be loaded from the
+     # crowbarmassage/MAEmbed dataset and converted the same way:
+     # embeddings_dataset = load_dataset("crowbarmassage/MAEmbed")
+     # speaker_embeddings = torch.tensor(embeddings_dataset[0]["embedding"]).unsqueeze(0)
+
+     inputs = processor_tts(text=text, return_tensors="pt")
+     speech = model_tts.generate_speech(
+         inputs["input_ids"], speaker_embeddings, vocoder=vocoder
+     )
+     return speech
+
+ # Full pipeline: load audio, then transcribe -> translate -> synthesise.
+
+ def speech_to_speech_translation(audio_filepath):
+     # Load the audio file and resample to the 16 kHz Whisper expects.
+     waveform, sampling_rate = torchaudio.load(audio_filepath)
+     if sampling_rate != 16000:
+         resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
+         waveform = resampler(waveform)
+         sampling_rate = 16000
+
+     # Downmix multi-channel audio to mono: torchaudio returns shape
+     # (channels, samples), but the Whisper processor expects a 1-D array.
+     waveform = waveform.mean(dim=0)
+
+     # Wrap the waveform in the dictionary format transcribe() expects.
+     audio_dict = {
+         "audio": {
+             "array": waveform.numpy(),
+             "sampling_rate": sampling_rate,
+         }
+     }
+
+     transcribed_text = transcribe(audio_dict)
+     translated_text = translate(transcribed_text)
+     synthesised_speech = synthesise(translated_text)
+
+     # Scale the float waveform in [-1, 1] to 16-bit PCM for Gradio.
+     synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
+     return 16000, synthesised_speech
+
+ demo = gr.Blocks()
+
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(source="microphone", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(source="upload", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+
+ with demo:
+     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+
+ demo.launch(debug=True, share=False)
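
A quick sanity check before relying on the UI is to call speech_to_speech_translation directly on the bundled example.wav. The sketch below is illustrative and not part of this commit; it assumes the functions defined above are already in scope (for example, pasted into the same interpreter session), since importing app.py as written would also execute demo.launch().

# Hypothetical smoke test (not in the commit): run the full
# ASR -> MT -> TTS chain once and check the output contract.
rate, samples = speech_to_speech_translation("example.wav")

assert rate == 16000              # the function always returns 16 kHz audio
assert samples.dtype == np.int16  # 16-bit PCM, as gr.Audio(type="numpy") expects
print(f"synthesised {samples.shape[-1] / rate:.2f} s of French speech")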
example.wav ADDED
Binary file (263 kB).
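
The sample clip can be inspected locally with torchaudio; this snippet is illustrative and not part of the commit:

# Hypothetical inspection of the bundled sample clip.
import torchaudio

waveform, sr = torchaudio.load("example.wav")
print(sr, waveform.shape)  # native sampling rate and (channels, samples)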
 
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
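
On Spaces, packages.txt lists OS-level packages installed with apt before the app starts; ffmpeg is what allows Gradio and torchaudio to decode compressed uploads (for example .mp3) rather than only plain .wav. A defensive startup check, illustrative and not part of this commit, might look like:

# Hypothetical startup guard (not in the commit): confirm the ffmpeg
# binary installed via packages.txt is actually on PATH.
import shutil

if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found; on Spaces, add it to packages.txt")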
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ git+https://github.com/huggingface/transformers
+ datasets
+ sentencepiece
+ torchaudio
+ IPython