File size: 1,363 Bytes
3f312c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import gradio as gr
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel, AutoProcessor
import soundfile as sf

# Text front-end: turns raw text into input IDs and holds the speaker map.
processor = AutoProcessor.from_pretrained(
    "bookbot/lightspeech-mfa-sw-v4"
)

# Acoustic model (text IDs -> mel-spectrogram), loaded from the same
# repository as the processor.
lightspeech = TFAutoModel.from_pretrained(
    "bookbot/lightspeech-mfa-sw-v4"
)

# Vocoder (mel-spectrogram -> waveform).
mb_melgan = TFAutoModel.from_pretrained(
    "bookbot/mb-melgan-hifi-postnets-sw-v4"
)


def tts(text, speaker_name="sw-TZ-Victoria"):
    """Synthesize speech for ``text`` and return the path to a WAV file.

    The text is converted to input IDs by ``processor``, turned into a
    mel-spectrogram by ``lightspeech`` and then into a waveform by
    ``mb_melgan``. The waveform is written as 16-bit PCM at 44100 Hz.

    Each call writes to its own temporary file. A single shared
    ``output.wav`` would let concurrent requests overwrite each other's
    audio before it is served.

    Args:
        text: Text to synthesize.
        speaker_name: Key looked up in ``processor.speakers_map`` to select
            the voice.

    Returns:
        Path (str) of a newly created ``.wav`` file holding the audio. The
        file is not deleted by this function.

    Raises:
        KeyError: If ``speaker_name`` is missing from
            ``processor.speakers_map`` (assuming it is a plain mapping).
    """
    # Local import keeps this block self-contained; tempfile is stdlib.
    import tempfile

    # Process input text
    input_ids = processor.text_to_sequence(text)

    # Generate mel-spectrogram. expand_dims adds a leading axis of size 1;
    # the speaker and ratio tensors are one-element lists to match it.
    mel, _, _ = lightspeech.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor(
            [processor.speakers_map[speaker_name]], dtype=tf.int32
        ),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # Generate audio from mel-spectrogram; keep the first item of the first
    # axis and index 0 of the last axis, leaving a 1-D sequence of samples.
    audio = mb_melgan.inference(mel)[0, :, 0]

    # Reserve a unique file name, then close the handle so soundfile can
    # open the path itself (required on platforms that lock open files).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_path = handle.name

    # NOTE(review): 44100 Hz is assumed to match the vocoder's output rate.
    sf.write(output_path, audio, 44100, "PCM_16")

    # Return the audio file for Gradio to play
    return output_path


# Wire `tts` into a Gradio UI using the "text" input and "audio" output
# component shortcuts.
iface = gr.Interface(
    fn=tts,
    inputs="text",
    outputs="audio",
)

# Start serving the UI.
iface.launch()