File size: 2,377 Bytes
f5cf172
 
 
 
 
1e99d58
f5cf172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e4ef3d
1e99d58
 
 
 
 
f5cf172
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import subprocess
import tempfile
import wave

import IPython
import espeakng
import torch
from huggingface_hub.inference_api import InferenceApi
from scipy.io import wavfile
from transformers import pipeline
from TTS.api import TTS

def synth_mms(text:str, model:str):
    '''
    Use Huggingface inference pipeline to synthesize text.
    (Can be replaced by inference API, but that requires stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Tuple of (audio as a numpy array, sampling rate), or None when no
        model code is given.
    '''
    # Guard clause: without a model there is nothing to synthesize.
    if model is None:
        return None

    # device=-1 pins inference to CPU; change to a GPU index to use CUDA.
    pipe = pipeline("text-to-speech", model=model, device=-1)
    mms_tts = pipe(text)
    return mms_tts['audio'], mms_tts['sampling_rate']



def synth_coqui(text:str, model:str):
    '''
    Use Coqui inference API to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Tuple of (audio samples as a numpy array, sampling rate), or None
        when no model code is given.
    '''
    # Guard clause: without a model there is nothing to synthesize.
    if model is None:
        return None

    # Run on GPU when available, otherwise CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Init TTS
    tts = TTS(model, progress_bar=False).to(device)

    # Use a unique temporary file rather than a fixed "test.wav" in the CWD:
    # a fixed name collides under concurrent calls, and the old code leaked
    # the file if wavfile.read raised.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # Coqui opens the path itself; we only need the name.
    try:
        tts.tts_to_file(text=text, file_path=tmp_path, is_multi_speaker=False)
        sampling_rate, wav = wavfile.read(tmp_path)
    finally:
        os.remove(tmp_path)

    return wav, sampling_rate


def synth_espeakng(text:str, model:str):
    '''
    Use eSpeak-NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code (an espeak-ng voice name)
    Returns:
        Tuple of (audio samples as a numpy array, sampling rate), or None
        when no model code is given.
    '''
    # Guard clause: without a voice there is nothing to synthesize.
    if model is None:
        return None

    esng = espeakng.Speaker()
    esng.voice = model

    # Use a unique temporary file rather than a fixed "test.wav" in the CWD:
    # a fixed name collides under concurrent calls, and the old code leaked
    # the file if wavfile.read raised.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # espeak-ng opens the path itself; we only need the name.
    try:
        esng.say(text, export_path=tmp_path)
        sampling_rate, wav = wavfile.read(tmp_path)
    finally:
        os.remove(tmp_path)

    return wav, sampling_rate