Spaces:

Ahmed007
/

test

Sleeping

App Files Files Community

Ahmed007 committed on Oct 20, 2023

Commit

8ca6232

•

1 Parent(s): 7da1e09

Upload 5 files

Browse files

Files changed (5) hide show

README.md +6 -6
app.py +139 -0
example.wav +0 -0
packages.txt +1 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,13 +1,13 @@
 ---
-title: Test
-emoji: 💻
-colorFrom: yellow
-colorTo: gray
 sdk: gradio
-sdk_version: 3.50.2
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Speech To Speech Translation
+emoji: 🏆
+colorFrom: pink
+colorTo: indigo
 sdk: gradio
+sdk_version: 3.36.1
 app_file: app.py
 pinned: false
+duplicated_from: course-demos/speech-to-speech-translation
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# -*- coding: utf-8 -*-
+"""app.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/143eWt9oxUTcF59OBiVybOgKXJB3QOTsK
+"""
+# Beginning of Unit 7
+from transformers.models.markuplm.tokenization_markuplm import MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
+import torch, torchaudio
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import sentencepiece
+from transformers import MarianMTModel, MarianTokenizer
+from datasets import load_dataset
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from IPython.display import Audio
+import numpy as np
+target_dtype = np.int16  # 16-bit integer samples; the same dtype the synthesised speech is cast to before it is returned
+max_range = np.iinfo(target_dtype).max  # largest value representable in target_dtype (32767 for int16)
+# Load Spanish Audio
+def transcribe(audio):
+    model_id_asr = "openai/whisper-small"
+    processor_asr = WhisperProcessor.from_pretrained(model_id_asr)
+    model_asr = WhisperForConditionalGeneration.from_pretrained(model_id_asr)
+    model_asr.config.forced_decoder_ids = None
+    input_features = processor_asr(audio["audio"]["array"], sampling_rate=audio["audio"]["sampling_rate"], return_tensors="pt").input_features
+    predicted_ids = model_asr.generate(input_features)
+    # decode token ids to text
+    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
+    return transcription[0]
+# Run inference on Spanish Audio vector
+def translate(text):
+    model_id_mt = "Helsinki-NLP/opus-mt-es-fr"
+    tokenizer_mt = MarianTokenizer.from_pretrained(model_id_mt)
+    model_mt = MarianMTModel.from_pretrained(model_id_mt)
+    # Tokenize the input text
+    input_ids = tokenizer_mt.encode(text, return_tensors="pt")
+    # Generate translation
+    with torch.no_grad():
+        translated_ids = model_mt.generate(input_ids)
+    # Decode the translated text
+    translated_text = tokenizer_mt.decode(translated_ids[0], skip_special_tokens=True)
+    return translated_text
+def synthesise(text):
+    processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+    model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+    # Load your dataset from Hugging Face
+    #embeddings_dataset = load_dataset("crowbarmassage/MAEmbed")
+    #print(embeddings_dataset.features)
+    #print(embeddings_dataset[0])
+    # Extract the embedding (assuming it's in a column named 'embedding')
+    # Note: Adjust the index [0] if your embedding is at a different position in the dataset.
+    #embedding_array = embeddings_dataset[0]['embedding']
+    # Convert the embedding to a PyTorch tensor and add a batch dimension
+    #speaker_embeddings = torch.tensor(embedding_array).unsqueeze(0)
+    print(speaker_embeddings)
+    print(type(speaker_embeddings))
+    inputs = processor_tts(text=text, return_tensors="pt")
+    speech = model_tts.generate_speech(
+        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
+    )
+    print(speech)
+    print(len(speech))
+    print(torch.norm(speech))
+    return speech
+def speech_to_speech_translation(audio_filepath):
+    # Load the audio file
+    waveform, sampling_rate = torchaudio.load(audio_filepath)
+    if sampling_rate != 16000:
+      resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
+      waveform = resampler(waveform)
+      sampling_rate = 16000
+    # Convert the waveform to a numpy array and construct the expected dictionary format
+    audio_dict = {
+        "audio": {
+            "array": waveform.numpy(),
+            "sampling_rate": sampling_rate
+        }
+    }
+    transcribed_text = transcribe(audio_dict)
+    translated_text = translate(transcribed_text)
+    synthesised_speech = synthesise(translated_text)
+    #print(transcribed_text)
+    #print(translated_text)
+    #print(synthesised_speech)
+    #print(torch.min(synthesised_speech), torch.max(synthesised_speech))
+    synthesised_speech = (synthesised_speech * 32767).numpy().astype(np.int16)
+    #print(synthesised_speech)
+    #print(np.min(synthesised_speech), np.max(synthesised_speech))
+    return 16000, synthesised_speech
+import gradio as gr  # Gradio provides the web UI for the pipeline defined above
+demo = gr.Blocks()  # top-level container that hosts the tabbed UI
+mic_translate = gr.Interface(  # first tab: translate audio recorded from the microphone
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="microphone", type="filepath"),  # the pipeline receives a path to the recorded file
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),  # fed by the (sampling_rate, samples) tuple the pipeline returns
+)
+file_translate = gr.Interface(  # second tab: same pipeline, fed from an uploaded audio file
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="upload", type="filepath"),  # the pipeline receives a path to the uploaded file
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),  # same output format as the microphone tab
+)
+with demo:
+    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])  # tab titles listed in the same order as the interfaces
+demo.launch(debug=True, share=False)  # starts the app when the module is run; share=False requests no public share link

example.wav ADDED Viewed

Binary file (263 kB). View file

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+git+https://github.com/huggingface/transformers
+datasets
+sentencepiece
+torchaudio
+IPython