"""Streamlit app: generate a talking-avatar video.

Pipeline:
  Step 1 - choose, upload, or diffusion-generate an avatar image.
  Step 2 - choose/upload a voice and synthesize speech from text (Coqui TTS),
           or upload a ready-made audio track.
  Step 3 - lip-sync the chosen image to the audio with Wav2Lip.
"""
import math
import os
from glob import glob
from pathlib import Path

import streamlit as st
# from TTS.TTS.api import TTS
from TTS.utils.synthesizer import Synthesizer
from Wav2Lip.video_generator import create_video
from diffusers import StableDiffusionPipeline
from diffusers import LMSDiscreteScheduler

# --- TTS configuration -------------------------------------------------------
gpu = False
model_path = Path(r"tss_model/model_file.pth")
config_path = Path(r"tss_model/config.json")
vocoder_path = None
vocoder_config_path = None
model_dir = None
language = "en"
file_path = "generated_audio.wav"  # synthesized speech is always written here
speaker = None
split_sentences = True
pipe_out = None

# Build the synthesizer once at import time (it is reused by every
# "Generate audio" click).
# def get_synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, model_dir, gpu):
synthesizer = Synthesizer(
    tts_checkpoint=model_path,
    tts_config_path=config_path,
    tts_speakers_file=None,
    tts_languages_file=None,
    vocoder_checkpoint=vocoder_path,
    vocoder_config=vocoder_config_path,
    encoder_checkpoint=None,
    encoder_config=None,
    model_dir=model_dir,
    use_cuda=gpu,
)
#     return synthesizer
# synthesizer = get_synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, model_dir, gpu)


def get_audio(synthesizer, speaker, language, speaker_wav, split_sentences, text):
    """Synthesize `text` in the voice cloned from `speaker_wav`.

    The result is saved to the module-level `file_path`
    ("generated_audio.wav"); nothing is returned.
    """
    wav = synthesizer.tts(
        text=text,
        speaker_name=speaker,
        language_name=language,
        speaker_wav=speaker_wav,
        reference_wav=None,
        style_wav=None,
        style_text=None,
        reference_speaker_name=None,
        split_sentences=split_sentences,
    )
    synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)


# --- Gallery layout: ceil-divide item counts into rows of 3 columns ----------
# avatar_images_dir = Path('avatar_images')
avatar_images_list = os.listdir('avatar_images')
avatar_names_list = [name.split('.')[0] for name in avatar_images_list]
n_cols_avatars = 3
n_rows_avatars = math.ceil(len(avatar_images_list) / n_cols_avatars)

voice_audio_list = os.listdir('voice_audios')
voice_names_list = [name.split('.')[0] for name in voice_audio_list]
n_cols_voices = 3
n_rows_voices = math.ceil(len(voice_audio_list) / n_cols_voices)

st.set_page_config(
    page_title='Avatar service',
    layout='wide'
)

# NOTE(review): the original HTML markup of this header was lost when the file
# was mangled; only the inner text survived. A centered <h1> reproduces the
# apparent intent — confirm against the original layout.
st.markdown(
    "<h1 style='text-align: center;'>Avatar video generation</h1>",
    unsafe_allow_html=True,
)
# st.title('Avatar video generation')

st.subheader('Step 1: Avatar Selection')

with st.expander('Available avatars'):
    # Render the avatar gallery row by row; the last row may be partial,
    # so stop as soon as every image has been shown.
    n_images_shown = 0
    for i in range(n_rows_avatars):
        avatar_cols_list = st.columns(n_cols_avatars)
        for j in range(n_cols_avatars):
            idx = j + i * n_cols_avatars  # was a hard-coded `3`
            avatar_cols_list[j].image(
                os.path.join('avatar_images', avatar_images_list[idx]),
                width=150,
                caption=avatar_names_list[idx],
            )
            n_images_shown += 1
            if n_images_shown == len(avatar_images_list):
                break


def avatar_callback():
    """Remember the selectbox choice and drop any previously generated avatar."""
    if st.session_state.avatar_image:
        st.session_state.selected_avatar = st.session_state.avatar_image
        # A freshly selected stock avatar supersedes a diffusion-generated one.
        if os.path.isfile('generated_avatar.jpg'):
            os.remove('generated_avatar.jpg')
        # if os.path.isfile('uploaded_avatar_image.jpg'):
        #     os.remove('uploaded_avatar_image.jpg')


def uploaded_avatar_callback():
    """Persist an uploaded avatar image, keeping its original extension."""
    if st.session_state.uploaded_avatar_image is None:
        return
    image_path = (
        "uploaded_avatar_image"
        + os.path.splitext(st.session_state.uploaded_avatar_image.name)[-1]
    )
    with open(image_path, "wb") as f:
        f.write(st.session_state.uploaded_avatar_image.getvalue())


step1_col1, step1_col2 = st.columns(2)

with step1_col1:
    selected_avatar = st.selectbox(
        label='Please select an avatar',
        options=avatar_names_list,
        key='avatar_image',
        on_change=avatar_callback,
    )
    st.write('or')
    uploaded_image = st.file_uploader(
        label='Please upload an avatar',
        type=['png', 'jpg', 'jpeg'],
        on_change=uploaded_avatar_callback,
        key='uploaded_avatar_image',
    )
    st.write('or')
    st.text_area(
        label='Please type a prompt to generate an image for the avatar',
        key='image_prompt',
    )


def generate_avatar():
    """Run Stable Diffusion on the user's prompt and save the result.

    Triggered by the 'generate_avatar' button; writes 'generated_avatar.jpg'.
    """
    if st.session_state.avatar_generator:
        # if not os.path.exists('generated_avatars'):
        #     os.mkdir('generated_avatars')
        pipe = StableDiffusionPipeline.from_pretrained(
            pretrained_model_name_or_path='diffusion_model'
        )
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe_output = pipe(
            prompt=st.session_state.image_prompt,    # What to generate
            negative_prompt="Oversaturated, blurry, low quality, do not show head",
            height=480, width=640,                   # Specify the image size
            guidance_scale=13,                       # How strongly to follow the prompt
            num_inference_steps=40,                  # How many steps to take
            # generator=generator,                   # Fixed random seed
        )
        pipe_output.images[0].save('generated_avatar.jpg')


st.button(
    label='generate_avatar',
    key='avatar_generator',
    on_click=generate_avatar,
)
# st.write(st.session_state.avatar_generator)

with step1_col2:
    # Preview priority: explicit upload > generated image > selected stock avatar.
    if uploaded_image is not None:
        uploaded_avatar_image_path = glob('uploaded_avatar_image.*')[0]
        st.image(uploaded_avatar_image_path, width=300)
    elif len(glob('generated_avatar.*')) != 0:
        st.image('generated_avatar.jpg', width=300)
    else:
        st.image(
            os.path.join(
                'avatar_images',
                avatar_images_list[avatar_names_list.index(selected_avatar)],
            ),
            width=300,
        )

st.subheader('Step 2: Audio Selection')
# st.markdown("<h3>Option 1</h3>", unsafe_allow_html=True)
option1_expander = st.expander('Option 1')
option1_expander.write(
    '''Please select or upload an audio with a voice you want to be used in
    the video. Then provide a text that will be used in the video. Afterwards
    click on button to get the audio which will be used in the video: please,
    take into account that depending on the size of the text it may take some
    time.
    '''
)

with st.expander('Available voice audio'):
    n_voices_shown = 0
    for i in range(n_rows_voices):
        voice_cols_list = st.columns(n_cols_voices)
        # BUG FIX: the inner loop iterated range(n_cols_avatars); it only
        # worked because both grids happen to use 3 columns.
        for j in range(n_cols_voices):
            idx = j + i * n_cols_voices  # was a hard-coded `3`
            voice_cols_list[j].audio(
                os.path.join('voice_audios', voice_audio_list[idx])
            )
            voice_cols_list[j].write(voice_names_list[idx])
            n_voices_shown += 1
            if n_voices_shown == len(voice_audio_list):
                break


def voice_callback():
    """Remember the voice picked in the selectbox."""
    if st.session_state.voice_audio:
        st.session_state.selected_voice = st.session_state.voice_audio


def uploaded_voice_callback():
    """Persist an uploaded voice sample, keeping its original extension."""
    if st.session_state.uploaded_voice_audio is None:
        return
    audio_path = (
        "uploaded_voice_audio"
        + os.path.splitext(st.session_state.uploaded_voice_audio.name)[-1]
    )
    with open(audio_path, "wb") as f:
        f.write(st.session_state.uploaded_voice_audio.getvalue())


step21_col1, step21_col2 = st.columns(2)

with step21_col1:
    selected_voice = st.selectbox(
        label='Please select a voice to clone',
        options=voice_names_list,
        key='voice_audio',
        on_change=voice_callback,
    )
    st.write('or')
    uploaded_voice = st.file_uploader(
        "Upload a voice to clone",
        type=['mp3', 'wav'],
        key='uploaded_voice_audio',
        on_change=uploaded_voice_callback,
    )

with step21_col2:
    # NOTE(review): original markup lost; this is a vertical spacer so the
    # audio player lines up with the widgets in the left column.
    st.markdown('<br>', unsafe_allow_html=True)
    if uploaded_voice is None:
        st.audio(
            os.path.join(
                'voice_audios',
                voice_audio_list[voice_names_list.index(selected_voice)],
            )
        )
    else:
        uploaded_voice_audio_path = glob('uploaded_voice_audio.*')[0]
        st.audio(uploaded_voice_audio_path)

step21txt_col1, step21txt_col2 = st.columns(2)

with step21txt_col1:
    uploaded_txt = st.text_area(
        label='Please input text for avatar',
        key='txt4audio',
    )


def generate_audio():
    """Synthesize the typed text with the chosen/uploaded voice.

    Triggered by the 'Generate audio from text' button; the result lands in
    'generated_audio.wav' via `get_audio`.
    """
    if st.session_state.audio_button:
        if uploaded_voice is None:
            speaker_wav = os.path.join(
                'voice_audios',
                voice_audio_list[voice_names_list.index(selected_voice)],
            )
        else:
            # BUG FIX: the upload is saved with its original extension
            # (.mp3 OR .wav) by uploaded_voice_callback, so the hard-coded
            # "uploaded_voice_audio.mp3" broke for .wav uploads.
            speaker_wav = glob('uploaded_voice_audio.*')[0]
        get_audio(
            synthesizer,
            speaker,
            language,
            speaker_wav,
            split_sentences,
            text=st.session_state.txt4audio,
        )


with step21txt_col2:
    # NOTE(review): original markup lost; vertical spacer (see above).
    st.markdown('<br>', unsafe_allow_html=True)
    st.button(
        label='Generate audio from text',
        key='audio_button',
        on_click=generate_audio,
    )

# Right after a successful generation, let the user listen to the result.
if st.session_state.audio_button:
    gen_audio_col1, _ = st.columns(2)
    gen_audio_col1.audio("generated_audio.wav")

# st.subheader('Step 2 - Option 2')
option1_expander = st.expander('Option 2')
option1_expander.write(
    '''Please, just upload an audio that will be reproduced in the video.
    '''
)


def uploaded_audio_callback():
    """Persist a ready-made audio track, keeping its original extension."""
    if st.session_state.uploaded_audio is None:
        return
    audio_path = (
        "uploaded_audio"
        + os.path.splitext(st.session_state.uploaded_audio.name)[-1]
    )
    with open(audio_path, "wb") as f:
        f.write(st.session_state.uploaded_audio.getvalue())


step22_col1, step22_col2 = st.columns(2)

with step22_col1:
    uploaded_audio = st.file_uploader(
        "Please, upload an audio",
        type=['mp3', 'wav'],
        key='uploaded_audio',
        on_change=uploaded_audio_callback,
    )

with step22_col2:
    # NOTE(review): original markup lost; vertical spacer (see above).
    st.markdown('<br>', unsafe_allow_html=True)
    if uploaded_audio is not None:
        st.audio(glob('uploaded_audio.*')[0])

st.subheader('Step 3')


def generate_video():
    """Lip-sync the selected face image to the selected audio via Wav2Lip.

    Audio priority: a directly uploaded track beats the TTS result.
    Face priority: uploaded image > generated image > stock avatar.
    """
    if st.session_state.video_button:
        if uploaded_audio is None:
            voice_audio = glob('generated_audio.*')[0]
        else:
            voice_audio = glob('uploaded_audio.*')[0]
        # if st.session_state.audio_button:
        #     voice_audio = glob('generated_audio.*')[0]
        # else:
        #     voice_audio = os.path.join('voice_audios', voice_audio_list[voice_names_list.index(selected_voice)])
        if uploaded_image is not None:
            face = glob('uploaded_avatar_image.*')[0]
        elif len(glob('generated_avatar.*')) != 0:
            face = glob('generated_avatar.*')[0]
        else:
            face = os.path.join(
                'avatar_images',
                avatar_images_list[avatar_names_list.index(selected_avatar)],
            )
        create_video(voice_audio, face)


step3_button_col1, _, _ = st.columns([3, 4, 5])
with step3_button_col1:
    st.button(
        label='Generate video',
        key='video_button',
        on_click=generate_video,
    )

if st.session_state.video_button:
    step3_col1, _, _ = st.columns([4, 3, 5])
    with step3_col1:
        st.video(
            # os.path.join('avatar_videos', 'generated_video.mp4')
            'generated_video.mp4'
        )
    # with step3_col2:
    #     # st.markdown('<br>', unsafe_allow_html=True)
    #     # with open(os.path.join('avatar_videos', 'generated_video.mp4'), 'rb') as file:
    #     with open('generated_video.mp4', 'rb') as file:
    #         st.download_button(
    #             label='Download generated video',
    #             data=file,
    #             file_name='avatar_video.mp4',
    #             mime='video/mp4'
    #         )