"""Streamlit demo: pick a speaker, type some text, and play the generated speech."""

import tempfile
from pathlib import Path

import streamlit as st
import torch
import torchaudio
from transformers import pipeline

model_id = '11mlabs/indri-0.1-124m-tts'
task = 'indri-tts'

# Sample rate written into the WAV header.
# NOTE(review): presumably the rate the model generates at -- confirm.
SAMPLE_RATE = 24000


@st.cache_resource
def load_pipeline():
    """Build the TTS pipeline once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    the pipeline as a resource avoids reloading the model on each rerun.

    Returns:
        The object returned by ``transformers.pipeline`` for ``task`` and
        ``model_id``.
    """
    return pipeline(
        task,
        model=model_id,
        # To run on a GPU, pass e.g. device=torch.device('cuda:0') here;
        # update this based on your hardware.
        trust_remote_code=True,
    )


pipe = load_pipeline()

st.title("Indri")
st.subheader("Ultrafast multi-modal AI")

# Add some spacing and a description
st.markdown(
    "\nSelect a speaker and enter text to generate audio.\n",
    unsafe_allow_html=True,
)

# Maps the speaker token passed to the pipeline -> label shown in the dropdown.
speakers = {
    "[spkr_63]": "🇬🇧 👨 book reader",
    "[spkr_67]": "🇺🇸 👨 influencer",
    "[spkr_68]": "🇮🇳 👨 book reader",
    "[spkr_69]": "🇮🇳 👨 book reader",
    "[spkr_70]": "🇮🇳 👨 motivational speaker",
    "[spkr_62]": "🇮🇳 👨 book reader heavy",
    "[spkr_53]": "🇮🇳 👩 recipe reciter",
    "[spkr_60]": "🇮🇳 👩 book reader",
    "[spkr_74]": "🇺🇸 👨 book reader",
    "[spkr_75]": "🇮🇳 👨 entrepreneur",
    "[spkr_76]": "🇬🇧 👨 nature lover",
    "[spkr_77]": "🇮🇳 👨 influencer",
    "[spkr_66]": "🇮🇳 👨 politician",
}

# Create a container for the speaker selection and text input
with st.container():
    st.markdown("### Speaker Selection")
    speaker_id = st.selectbox(
        "Select a speaker:",
        options=list(speakers),
        format_func=lambda token: speakers[token],
    )

    st.markdown("### Text Input")
    text_input = st.text_area(
        "Enter text for TTS (max 200 characters):",
        max_chars=200,
    )

    if st.button("Generate Audio", key="generate_audio"):
        # Treat whitespace-only input like empty input instead of sending
        # it to the model.
        if text_input and text_input.strip():
            output = pipe([text_input], speaker=speaker_id)
            waveform = output[0]['audio'][0]

            # Write to a per-request temporary directory rather than a fixed
            # 'output.wav' in the working directory, so concurrent sessions
            # cannot overwrite each other's audio and nothing is left behind.
            with tempfile.TemporaryDirectory() as tmp_dir:
                wav_path = Path(tmp_dir) / "output.wav"
                torchaudio.save(str(wav_path), waveform, sample_rate=SAMPLE_RATE)
                audio_bytes = wav_path.read_bytes()

            st.audio(audio_bytes, format="audio/wav")
        else:
            st.warning("Please enter text to generate audio.")