File size: 2,377 Bytes
f5cf172
 
 
 
 
1e99d58
f5cf172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e4ef3d
1e99d58
 
 
 
 
f5cf172
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import subprocess
import tempfile
import wave

import IPython
import espeakng
import torch
from huggingface_hub.inference_api import InferenceApi
from scipy.io import wavfile
from transformers import pipeline
from TTS.api import TTS

def synth_mms(text:str, model:str):
    '''
    Use Huggingface inference pipeline to synthesize text.
    (Can be replaced by inference API, but that requires stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Tuple of (audio as a numpy array, sampling rate), or None when no
        model code is given.
    '''
    # Guard clause: without a model there is nothing to synthesize.
    if model is None:
        return None

    # device=-1 pins inference to CPU; change to a GPU index to use CUDA.
    pipe = pipeline("text-to-speech", model=model, device=-1)
    mms_tts = pipe(text)
    return mms_tts['audio'], mms_tts['sampling_rate']



def synth_coqui(text:str, model:str):
    '''
    Use Coqui inference API to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Tuple of (audio samples as a numpy array, sampling rate), or None
        when no model code is given.
    '''
    # Guard clause: without a model there is nothing to synthesize.
    if model is None:
        return None

    # Run on GPU when available, otherwise CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Init TTS
    tts = TTS(model, progress_bar=False).to(device)

    # Use a unique temporary file rather than a fixed "test.wav" in the CWD:
    # a fixed name collides under concurrent calls, and the old code leaked
    # the file if wavfile.read raised.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # Coqui opens the path itself; we only need the name.
    try:
        tts.tts_to_file(text=text, file_path=tmp_path, is_multi_speaker=False)
        sampling_rate, wav = wavfile.read(tmp_path)
    finally:
        os.remove(tmp_path)

    return wav, sampling_rate


def synth_espeakng(text:str, model:str):
    '''
    Use eSpeak-NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code (an espeak-ng voice name)
    Returns:
        Tuple of (audio samples as a numpy array, sampling rate), or None
        when no model code is given.
    '''
    # Guard clause: without a voice there is nothing to synthesize.
    if model is None:
        return None

    esng = espeakng.Speaker()
    esng.voice = model

    # Use a unique temporary file rather than a fixed "test.wav" in the CWD:
    # a fixed name collides under concurrent calls, and the old code leaked
    # the file if wavfile.read raised.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # espeak-ng opens the path itself; we only need the name.
    try:
        esng.say(text, export_path=tmp_path)
        sampling_rate, wav = wavfile.read(tmp_path)
    finally:
        os.remove(tmp_path)

    return wav, sampling_rate