File size: 2,359 Bytes
0a6371e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from io import BytesIO
import soundfile as sf

# Load models outside of function calls for efficiency
def load_models():
    """Fetch the pretrained SpeechT5 components used for synthesis.

    Returns:
        A ``(model, processor, vocoder)`` tuple, in that order: the
        ``SpeechT5ForTextToSpeech`` model and ``SpeechT5Processor`` from the
        ``microsoft/speecht5_tts`` checkpoint, and the ``SpeechT5HifiGan``
        vocoder from ``microsoft/speecht5_hifigan``.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
    text_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    hifigan = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint)
    return tts_model, text_processor, hifigan

# Built once at import time; text_to_speech reads these module-level globals
# on every call instead of reloading the checkpoints.
model, processor, vocoder = load_models()

# Load speaker embeddings
def get_speaker_embeddings(path="cmu_us_clb_arctic-wav-arctic_a0144.npy"):
    """Load a speaker embedding from a ``.npy`` file as a batched tensor.

    Args:
        path: Location of the NumPy file holding the embedding. Defaults to
            the file name this app has always used; a relative path is
            resolved against the current working directory.

    Returns:
        A ``torch.Tensor`` with the file's contents and one extra leading
        axis of size 1 (added with ``unsqueeze(0)``).

    Raises:
        OSError: If ``path`` cannot be opened (propagated from ``np.load``).
    """
    embedding = np.load(path)
    # The leading axis is the batch dimension expected by the caller.
    return torch.tensor(embedding).unsqueeze(0)

# Loaded once at import time; text_to_speech passes this same tensor to
# model.generate_speech for every text segment.
speaker_embeddings = get_speaker_embeddings()

# Function to convert text to speech
def _split_text(text, max_length):
    """Break *text* into chunks of at most *max_length* characters.

    Chunks end at whitespace so words are not cut in half; only a single
    word longer than *max_length* is hard-split. Runs of whitespace collapse
    to one space, so blank input yields an empty list.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    chunks = []
    current = ""
    for word in text.split():
        # A word that cannot fit in any chunk is sliced into full-size pieces.
        while len(word) > max_length:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_length])
            word = word[max_length:]

        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_length:
            current = candidate
        else:
            chunks.append(current)
            current = word

    if current:
        chunks.append(current)
    return chunks


def text_to_speech(text, max_length=100):
    """Synthesize speech for *text* with the module-level SpeechT5 objects.

    The text is split into segments of at most *max_length* characters at
    word boundaries. Each segment is turned into a spectrogram by
    ``model.generate_speech`` using ``speaker_embeddings``, then into a
    waveform by ``vocoder``; the waveforms are joined end to end.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text is
            reported as an error instead of producing an empty waveform.
        max_length: Maximum characters per segment (default 100, the value
            previously hard-coded here).

    Returns:
        ``(16000, waveform)`` on success, where ``waveform`` is a 1-D NumPy
        array, or ``(None, message)`` on failure, where ``message`` starts
        with ``"Error in text-to-speech conversion"``. This function never
        raises; callers distinguish the cases by the first element.
    """
    try:
        segments = _split_text(text or "", max_length)
        if not segments:
            return None, "Error in text-to-speech conversion: no text provided"

        waveforms = []
        # Inference only: keep autograd off for both the model and the vocoder.
        with torch.no_grad():
            for segment in segments:
                inputs = processor(text=segment, return_tensors="pt")
                spectrogram = model.generate_speech(
                    inputs["input_ids"], speaker_embeddings
                )
                waveforms.append(vocoder(spectrogram).cpu().numpy())

        # One concatenate call instead of boxing every sample into a list.
        return 16000, np.concatenate(waveforms)
    except Exception as e:
        # Broad on purpose: the caller relies on the (None, message) contract.
        return None, f"Error in text-to-speech conversion: {e}"

# Gradio Interface
def gradio_interface(text):
    """Adapt ``text_to_speech`` to the return shape the Gradio app uses.

    Args:
        text: The text entered by the user.

    Returns:
        ``(sample_rate, audio_data)`` when ``text_to_speech`` produced a
        non-empty NumPy waveform, otherwise ``None``.
    """
    import logging

    sample_rate, audio_data = text_to_speech(text)

    if not sample_rate or not isinstance(audio_data, np.ndarray):
        # On failure the second element is the error message; log it rather
        # than dropping it, so the cause is visible in the server output.
        logging.getLogger(__name__).error("%s", audio_data)
        return None

    if audio_data.size == 0:
        # NOTE(review): an empty waveform presumably cannot be encoded by
        # the audio output component -- treat it as "no audio"; confirm.
        logging.getLogger(__name__).error("Text-to-speech produced no audio samples")
        return None

    return sample_rate, audio_data

# Single-page UI: a multi-line text box feeding gradio_interface, with the
# result shown in an audio player.
interface = gr.Interface(
    fn=gradio_interface,
    title="Text to Voice",
    # The description is rendered with the embedded HTML link.
    description="High Fidelity TTS. Visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.",
    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio"),
)

# Starts the web server when this file is executed.
interface.launch()