"""Streamlit front end for generating speech with the Kokoro TTS pipeline."""

import io
import os
import time

import soundfile as sf
import streamlit as st
from kokoro import KPipeline

# Sample rate (Hz) passed to soundfile when encoding each segment as WAV.
SAMPLE_RATE = 24000


@st.cache_resource(show_spinner=False)
def load_pipeline(lang_code: str) -> KPipeline:
    """Return the Kokoro pipeline for *lang_code*, building it at most once.

    Streamlit re-executes the whole script on every interaction, so without
    caching the pipeline would be re-created on every button click.
    """
    return KPipeline(lang_code=lang_code)


def encode_wav(audio) -> io.BytesIO:
    """Encode *audio* as an in-memory WAV file, rewound and ready to be read.

    *audio* is whatever the pipeline yields for a segment (assumed to be a
    numpy-compatible array of samples).
    """
    buffer = io.BytesIO()
    sf.write(buffer, audio, SAMPLE_RATE, format="WAV")
    buffer.seek(0)
    return buffer


st.title("Text-to-Speech with Kokoro Pipeline")
st.markdown("Enter your text and configure options to generate audio segments.")

# Text input for the content to be synthesized.
text = st.text_area(
    "Enter text",
    value="The sky above the port was the color of television, tuned to a dead channel.",
    height=150,
)

# Voice selection: maps the label shown in the UI to the Kokoro voice id.
voice_options = {
    "American English (af_heart)": "af_heart",
    # You can add more voices here, for example:
    # "British English (b_voice)": "b_voice",
    # "Japanese (j_voice)": "j_voice",
}
voice_choice = st.selectbox("Select Voice", options=list(voice_options.keys()))
voice = voice_options[voice_choice]

# The pipeline language must match the chosen voice. Kokoro voice ids appear
# to be prefixed with their language code ("af_heart" -> "a"), so derive it
# from the voice instead of hard-coding it.
# NOTE(review): confirm this prefix convention for any newly added voice.
lang_code = voice[0]

# Slider for speech speed.
speed = st.slider("Speech Speed", min_value=0.5, max_value=2.0, value=1.0)

if st.button("Generate Audio"):
    if not text.strip():
        st.error("Please enter some text!")
    else:
        # Top-level UI boundary: any failure is reported to the user rather
        # than crashing the app.
        try:
            with st.spinner("Initializing TTS pipeline..."):
                pipeline = load_pipeline(lang_code)

            # The pipeline call hands back a generator, so the synthesis work
            # happens while iterating. The loop therefore has to live inside
            # the spinner for the spinner to cover the actual generation.
            with st.spinner("Generating audio..."):
                segments = pipeline(
                    text,
                    voice=voice,
                    speed=speed,
                    split_pattern=r"\n+",
                )
                for segment_index, (gs, ps, audio) in enumerate(segments):
                    st.markdown(f"**Segment {segment_index}**")
                    st.write("**Graphemes/Text:**", gs)
                    st.write("**Phonemes:**", ps)
                    # Convert the segment to an in-memory WAV so Streamlit
                    # can play it without touching the filesystem.
                    st.audio(encode_wav(audio), format="audio/wav")

            st.success("Audio generation complete!")
        except Exception as e:
            st.error("An error occurred during audio generation.")
            st.exception(e)