# -*- coding: utf-8 -*- """sttToTts.ipynb Automatically generated by Colaboratory. Original file is located at https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38 """ #text-to-speech and speech to text !pip install TTS !pip install transformers #text to speech from TTS.api import TTS tts = TTS("tts_models/multilingual/multi-dataset/your_tts", cs_api_model = "TTS.cs_api.CS_API", gpu=True) #voice recording import IPython.display import google.colab.output import base64 # all imports for voice recording from IPython.display import Javascript from google.colab import output from base64 import b64decode #to record sound, found on https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be RECORD = """ const sleep = time => new Promise(resolve => setTimeout(resolve, time)) const b2text = blob => new Promise(resolve => { const reader = new FileReader() reader.onloadend = e => resolve(e.srcElement.result) reader.readAsDataURL(blob) }) var record = time => new Promise(async resolve => { stream = await navigator.mediaDevices.getUserMedia({ audio: true }) recorder = new MediaRecorder(stream) chunks = [] recorder.ondataavailable = e => chunks.push(e.data) recorder.start() await sleep(time) recorder.onstop = async ()=>{ blob = new Blob(chunks) text = await b2text(blob) resolve(text) } recorder.stop() }) """ def record(name, sec): display(Javascript(RECORD)) s = output.eval_js('record(%d)' % (sec*1000)) b = b64decode(s.split(',')[1]) with open(f'{name}.webm','wb') as f: f.write(b) return (f'{name}.webm') # or webm ? 
#to record the text which is going to be transcribed record('audio', sec = 10) #works -- speech-to-text with an audio I provide the path to reach from transformers import WhisperProcessor, WhisperForConditionalGeneration import librosa # load model and processor processor = WhisperProcessor.from_pretrained("openai/whisper-small") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small") model.config.forced_decoder_ids = None # load audio from a specific path audio_path = "audio.webm" audio_array, sampling_rate = librosa.load(audio_path, sr=16000) # "sr=16000" ensures that the sampling rate is as required # process the audio array input_features = processor(audio_array, sampling_rate, return_tensors="pt").input_features predicted_ids = model.generate(input_features) transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) print(transcription) #to record the speaker's voice used for tts record('speaker', sec = 10 ) #library to convert digits to words (ex : 1 --> one) import locale locale.getpreferredencoding = lambda: "UTF-8" !pip install inflect import re import inflect #because numbers under digit format are ignored otherwise def convert_numbers_to_words(s): p = inflect.engine() # Find all sequences of digits in the string numbers = re.findall(r'\d+', s) for number in numbers: # Convert each number to words words = p.number_to_words(number) # Replace the original number in the string with its word representation s = s.replace(number, words) return s #model test 1 for text to speech #works - text to speech with voice cloner (by providing the path to the audio where the voice is) from google.colab import drive from IPython.display import Audio tts.tts_to_file(text=convert_numbers_to_words(str(transcription)), file_path="output.wav", speaker_wav='speaker.webm', language="en", emotion ='angry', speed = 2) audio_path = "output.wav" Audio(audio_path) #model test 2 for text to speech from IPython.display import Audio # TTS with 
# on-the-fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="speaker.webm",
    # FIX: was misspelled "ouptut.wav", so the Audio() call below played a
    # stale file instead of this synthesis result.
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)

# --- Model test 3 for text to speech --------------------------------------
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
from IPython.display import Audio

# Generate speech by cloning a voice using custom settings.
tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
                file_path="output.wav",
                speaker_wav="Slide 1.m4a",
                language="en",
                emotion="neutral",
                decoder_iterations=35)
audio_path = "output.wav"
Audio(audio_path)

# FIX: OUTPUT_PATH was used below but never defined (NameError at runtime);
# define it before the Coqui Studio calls.
OUTPUT_PATH = "output.wav"

# Init TTS with the target studio speaker.
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio",
          progress_bar=False)
# Run TTS.
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
# Run TTS with emotion and speed control.
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH,
                emotion="Happy", speed=1.5)

# --- Model test 4 for text to speech --------------------------------------
from IPython.display import Audio
from TTS.api import TTS

# api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
# api.tts_to_file("This is a test.", file_path="output.wav")

# TTS with on-the-fly voice conversion.
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    "I am a basic human",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)