import streamlit as st
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from io import StringIO
import soundfile as sf
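
# To launch (assuming this file is saved as a standalone Streamlit script):
#   streamlit run <this_file>.py
# Two local assets are assumed to sit next to the script:
#   - style.css                               (custom styling, injected below)
#   - cmu_us_clb_arctic-wav-arctic_a0144.npy  (speaker embedding, loaded below)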

# Load models once at startup; st.cache_resource is the right cache for
# unpicklable objects like models (st.cache_data would try to serialize them)
@st.cache_resource
def load_models():
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder

model, processor, vocoder = load_models()

# Load speaker embeddings
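# (Assumption: the .npy file holds a 512-dimensional x-vector for one CMU
# ARCTIC speaker; unsqueeze(0) adds the batch dimension generate_speech
# expects, yielding a (1, 512) tensor.)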
@st.cache_data
def get_speaker_embeddings():
    speaker_embeddings = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    return torch.tensor(speaker_embeddings).unsqueeze(0)

speaker_embeddings = get_speaker_embeddings()

# Inject custom styling from a local CSS file
def local_css(file_name):
    with open(file_name) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)

local_css("style.css")

# Page header
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")

# Convert text to speech, writing one WAV file per segment
def text_to_speech(text):
    try:
        # SpeechT5 handles short inputs best, so cap each segment at roughly
        # 100 characters, breaking on whitespace so words are not cut in half
        max_length = 100
        segments, current = [], ""
        for word in text.split():
            if current and len(current) + len(word) + 1 > max_length:
                segments.append(current)
                current = word
            else:
                current = f"{current} {word}".strip()
        if current:
            segments.append(current)

        audio_paths = []
        for i, segment in enumerate(segments):
            inputs = processor(text=segment, return_tensors="pt")
            # Without a vocoder argument, generate_speech returns a mel spectrogram
            spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
            with torch.no_grad():
                speech = vocoder(spectrogram)
            audio_path = f"speech_segment_{i}.wav"
            # SpeechT5 and its HiFi-GAN vocoder operate at 16 kHz
            sf.write(audio_path, speech.numpy(), samplerate=16000)
            audio_paths.append(audio_path)

        return audio_paths
    except Exception as e:
        st.error(f"Error in text-to-speech conversion: {e}")
        return []
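
# Note: transformers can also run the vocoder inside the model call, e.g.
#   speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
# which returns the waveform directly rather than a spectrogram.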

# Concatenate the per-segment WAV files into a single output file
def combine_audio_segments(paths):
    segments = []
    samplerate = 16000  # matches the rate the segments were written at
    for path in paths:
        data, samplerate = sf.read(path)
        segments.append(data)
    combined = np.concatenate(segments) if segments else np.zeros(0)
    sf.write("combined_speech.wav", combined, samplerate)
    return "combined_speech.wav"

# Text Input
text = st.text_area("Type your text or upload a text file below.")

# Convert button for typed text
if st.button("Convert"):
    if text:
        audio_paths = text_to_speech(text)
        if audio_paths:
            combined_audio_path = combine_audio_segments(audio_paths)
            # Context manager ensures the file handle is closed after reading
            with open(combined_audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format="audio/wav")
    else:
        st.error("Please enter some text to convert.")

# File Uploader
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    text = stringio.read()
    st.write(text)

    if st.button("Convert Uploaded File", key=1):
        audio_paths = text_to_speech(text)
        combined_audio_path = combine_audio_segments(audio_paths)
        audio_file = open(combined_audio_path, 'rb')
        audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/wav')
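
# Note: the per-segment WAV files are left on disk after combining; in a
# long-running deployment they could be deleted once merged (e.g. with
# os.remove(path) for each path in audio_paths).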