# swahili_tts / app.py
# (Hugging Face file-viewer residue preserved as a comment so the module parses:
#  author awkondoro, "initial commit", revision 3f312c5, raw/history/blame links, 1.36 kB)
import gradio as gr
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel, AutoProcessor
import soundfile as sf
# Load the model and processor (downloaded from the Hugging Face Hub at startup).
# LightSpeech acoustic model: token ids -> mel-spectrogram (Swahili, MFA-aligned checkpoint).
lightspeech = TFAutoModel.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
# Processor: text -> token-id sequence, plus the speaker-name -> speaker-id map used by tts().
processor = AutoProcessor.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
# MB-MelGAN vocoder: mel-spectrogram -> raw waveform.
mb_melgan = TFAutoModel.from_pretrained("bookbot/mb-melgan-hifi-postnets-sw-v4")
def tts(text, speaker_name="sw-TZ-Victoria"):
    """Synthesize Swahili speech from *text* and return a path to a WAV file.

    Pipeline: the processor converts text to token ids, LightSpeech turns the
    ids into a mel-spectrogram, and MB-MelGAN vocodes the spectrogram into a
    waveform, which is written out as 16-bit PCM WAV.

    Parameters
    ----------
    text : str
        Input text to synthesize.
    speaker_name : str, optional
        Key into ``processor.speakers_map`` selecting the voice
        (default ``"sw-TZ-Victoria"``).

    Returns
    -------
    str
        Path of the generated WAV file, suitable for Gradio's audio output.
    """
    import tempfile  # local import: only this function needs it

    # Process input text into token ids.
    input_ids = processor.text_to_sequence(text)
    # Generate mel-spectrogram (expand_dims adds the batch dimension).
    mel, _, _ = lightspeech.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor(
            [processor.speakers_map[speaker_name]], dtype=tf.int32
        ),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )
    # Generate audio from mel-spectrogram; [0, :, 0] strips batch and channel dims.
    audio = mb_melgan.inference(mel)[0, :, 0]

    # BUG FIX: the original wrote to a fixed "output.wav", so two concurrent
    # Gradio requests would clobber each other's file. Write to a unique
    # temp file instead; Gradio still receives a plain path string.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        out_path = f.name
    # NOTE(review): 44100 Hz is assumed to match the vocoder's training sample
    # rate — confirm against the bookbot/mb-melgan-hifi-postnets-sw-v4 model card.
    sf.write(out_path, audio, 44100, "PCM_16")
    # Return the audio file path for Gradio to play.
    return out_path
# Create a Gradio interface: one text box in, one audio player out.
# Gradio plays the WAV file whose path tts() returns.
iface = gr.Interface(fn=tts, inputs="text", outputs="audio")
# Launch the interface (blocks and serves the web UI).
iface.launch()