# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/143eWt9oxUTcF59OBiVybOgKXJB3QOTsK
"""

# Beginning of Unit 7

import torch
import torchaudio
import numpy as np
import sentencepiece  # required by MarianTokenizer; importing it surfaces a missing install early
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# Output format expected by Gradio's numpy audio component: 16-bit PCM
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


# Speech recognition: transcribe Spanish audio with Whisper
def transcribe(audio):
    # Note: the model is reloaded on every call; load it once at module level if latency matters
    model_id_asr = "openai/whisper-small"
    processor_asr = WhisperProcessor.from_pretrained(model_id_asr)
    model_asr = WhisperForConditionalGeneration.from_pretrained(model_id_asr)
    # No forced language/task tokens: Whisper detects the language and transcribes it
    model_asr.config.forced_decoder_ids = None

    input_features = processor_asr(
        audio["audio"]["array"],
        sampling_rate=audio["audio"]["sampling_rate"],
        return_tensors="pt",
    ).input_features
    predicted_ids = model_asr.generate(input_features)

    # Decode token ids to text
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


# Machine translation: Spanish text -> French text with MarianMT
def translate(text):
    model_id_mt = "Helsinki-NLP/opus-mt-es-fr"
    tokenizer_mt = MarianTokenizer.from_pretrained(model_id_mt)
    model_mt = MarianMTModel.from_pretrained(model_id_mt)

    # Tokenize the input text
    input_ids = tokenizer_mt.encode(text, return_tensors="pt")

    # Generate the translation
    with torch.no_grad():
        translated_ids = model_mt.generate(input_ids)

    # Decode the translated text
    translated_text = tokenizer_mt.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text


# Text-to-speech: synthesise French speech with a fine-tuned SpeechT5 checkpoint
def synthesise(text):
    processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
    model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Speaker embedding (x-vector) that conditions the voice of the synthesised speech
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Alternative: load a custom speaker embedding from Hugging Face instead
    #embeddings_dataset = load_dataset("crowbarmassage/MAEmbed")
    #print(embeddings_dataset.features)
    #print(embeddings_dataset[0])
    # Extract the embedding (assuming it's in a column named 'embedding');
    # adjust the index [0] if the embedding is at a different position in the dataset
    #embedding_array = embeddings_dataset[0]['embedding']
    # Convert the embedding to a PyTorch tensor and add a batch dimension
    #speaker_embeddings = torch.tensor(embedding_array).unsqueeze(0)

    print(speaker_embeddings)
    print(type(speaker_embeddings))

    inputs = processor_tts(text=text, return_tensors="pt")
    speech = model_tts.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    print(speech)
    print(len(speech))
    print(torch.norm(speech))
    return speech


# Full pipeline: Spanish speech -> Spanish text -> French text -> French speech
def speech_to_speech_translation(audio_filepath):
    # Load the audio file
    waveform, sampling_rate = torchaudio.load(audio_filepath)

    # Resample to the 16 kHz rate Whisper expects
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        waveform = resampler(waveform)
        sampling_rate = 16000

    # torchaudio returns a (channels, samples) tensor; average to a mono 1-D waveform
    waveform = waveform.mean(dim=0)

    # Construct the dictionary format expected by transcribe()
    audio_dict = {
        "audio": {
            "array": waveform.numpy(),
            "sampling_rate": sampling_rate,
        }
    }

    transcribed_text = transcribe(audio_dict)
    translated_text = translate(transcribed_text)
    synthesised_speech = synthesise(translated_text)
    #print(transcribed_text)
    #print(translated_text)
    #print(synthesised_speech)
    #print(torch.min(synthesised_speech), torch.max(synthesised_speech))

    # Scale the float waveform to 16-bit PCM for Gradio's numpy audio output
    synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
    #print(synthesised_speech)
    #print(np.min(synthesised_speech), np.max(synthesised_speech))
    return 16000, synthesised_speech


import gradio as gr

demo = gr.Blocks()

# Note: `source=` is the Gradio 3.x argument; Gradio 4 renamed it to `sources=[...]`
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(debug=True, share=False)
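
# A quick way to exercise the pipeline without the Gradio UI (a sketch, not part of
# the app): it assumes a local Spanish recording at the hypothetical path
# "sample_es.wav" and the `soundfile` package for writing the result to disk.
# Uncomment to run after stopping the demo server above.
#
# import soundfile as sf
# rate, audio_out = speech_to_speech_translation("sample_es.wav")  # hypothetical input file
# sf.write("translated_fr.wav", audio_out, rate)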