import io

import streamlit as st
import torch
import torchaudio
from transformers import pipeline

model_id = '11mlabs/indri-0.1-124m-tts'
task = 'indri-tts'

pipe = pipeline(
    task,
    model=model_id,
    #device=torch.device('cuda:0'), # Update this based on your hardware,
    trust_remote_code=True
)

# Page header: product name followed by its tagline.
st.title("Indri")
st.subheader("Ultrafast multi-modal AI")

# Centered heading and short usage hint, rendered as raw HTML
# (hence unsafe_allow_html=True; both strings are static).
for centered_html in (
    "<h3 style='text-align: center;'>Text-to-Speech Application</h3>",
    "<p style='text-align: center;'>Select a speaker and enter text to generate audio.</p>",
):
    st.markdown(centered_html, unsafe_allow_html=True)

# Speaker token -> human-readable label shown in the speaker dropdown.
# Insertion order is the order in which the options are listed.
speakers = {
    "[spkr_63]": "🇬🇧 👨 book reader",
    "[spkr_67]": "🇺🇸 👨 influencer",
    "[spkr_68]": "🇮🇳 👨 book reader",
    "[spkr_69]": "🇮🇳 👨 book reader",
    "[spkr_70]": "🇮🇳 👨 motivational speaker",
    "[spkr_62]": "🇮🇳 👨 book reader heavy",
    "[spkr_53]": "🇮🇳 👩 recipe reciter",
    "[spkr_60]": "🇮🇳 👩 book reader",
    "[spkr_74]": "🇺🇸 👨 book reader",
    "[spkr_75]": "🇮🇳 👨 entrepreneur",
    "[spkr_76]": "🇬🇧 👨 nature lover",
    "[spkr_77]": "🇮🇳 👨 influencer",
    "[spkr_66]": "🇮🇳 👨 politician",
}

# Group the speaker dropdown and the text box in a single container.
with st.container():
    st.markdown("### Speaker Selection")
    # The option values are the speaker tokens; the dropdown displays the
    # matching label from `speakers`.
    speaker_tokens = list(speakers)
    speaker_id = st.selectbox(
        "Select a speaker:",
        options=speaker_tokens,
        format_func=speakers.get,
    )

    st.markdown("### Text Input")
    # The widget itself enforces the 200-character limit.
    text_input = st.text_area(
        "Enter text for TTS (max 200 characters):",
        max_chars=200,
    )

# Synthesize speech only on the rerun triggered by a click on the button.
if st.button("Generate Audio", key="generate_audio"):
    # Reject whitespace-only input as well as empty input, so that blank
    # text is never sent to the model.
    if text_input and text_input.strip():
        with st.spinner("Generating audio..."):
            output = pipe([text_input], speaker=speaker_id)
            # Encode into an in-memory buffer instead of a fixed
            # 'output.wav' in the working directory: a shared file name is
            # overwritten when several sessions generate audio at once.
            wav_buffer = io.BytesIO()
            torchaudio.save(
                wav_buffer,
                output[0]['audio'][0],
                # Same rate as before; presumably the model's native output
                # rate -- confirm against the model card.
                sample_rate=24000,
                # A format is required when saving to a file-like object.
                format="wav",
            )
        st.audio(wav_buffer.getvalue(), format="audio/wav")
    else:
        st.warning("Please enter text to generate audio.")