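# app.py: Streamlit UI for a Wav2Lip-based text-to-speech talking-head demo.
# Typed text is synthesized to speech and lip-synced onto a chosen reference
# avatar video, then rendered back in the browser.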
import streamlit as st
import os
import sys
import torch
import pickle
import numpy
import librosa

# Debug output: log the numpy/librosa versions resolved at startup
print(numpy.__version__)
print(librosa.__version__)
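
# Make the vendored Wav2Lip checkout importable before the avatar module
# (which depends on it) is imported.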
path_to_add = os.path.join(os.path.dirname(__file__), "Wav2Lip")
if path_to_add not in sys.path:
    sys.path.insert(0, path_to_add)
from avatar import Avatar
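
# Avatar (local avatar module) wraps the whole pipeline; this script uses its
# load_model, get_video_full_frames, load_face_detection_results,
# text_to_lip_video and dir_clean_up methods plus the path/filename
# attributes configured below.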
options = ['Aude', 'Kyla', 'Liv', 'MC6']
images = ['ref_videos/Aude.png', 'ref_videos/Kyla.png', 'ref_videos/Liv.png', 'ref_videos/MC6.png']
big_text = """
<div style='text-align: center;'>
    <h1 style='font-size: 30px;'>Text to Speech Synchronized Video</h1>
</div>
"""
# Display the styled text
st.markdown(big_text, unsafe_allow_html=True)
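
# Status placeholder and a progress bar for the one-time initialization.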
current_status_placeholder = st.empty()
init_progress_bar = st.progress(0)
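
# Streamlit reruns this script from the top on every interaction, so the heavy
# setup (model weights, reference frames, face detections) runs once and is
# cached in st.session_state behind the 'is_initialized' flag.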
if 'is_initialized' not in st.session_state:
    st.session_state.avatar = Avatar()
    st.session_state.avatar.export_video = False
    current_status_placeholder.write("Loading model...")
    st.session_state.avatar.load_model("checkpoint/wav2lip_gan.pth")
    current_status_placeholder.write("Model loaded")
    st.session_state.avatar.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(st.session_state.avatar.device)
    st.session_state.avatar.output_audio_path = "audio/"
    st.session_state.avatar.output_audio_filename = "result.wav"
    st.session_state.avatar.temp_lip_video_no_voice_path = "temp/"
    st.session_state.avatar.temp_lip_video_no_voice_filename = "result.avi"
    st.session_state.avatar.output_video_path = "results/"
    st.session_state.avatar.output_video_name = "result_voice.mp4"
    st.session_state.selected_option = "Liv"
    st.session_state.avatar.ref_video_path_and_filename = f"ref_videos/{st.session_state.selected_option}.mp4"
    st.session_state.avatar.get_video_full_frames(st.session_state.avatar.ref_video_path_and_filename)
    st.session_state.avatar.face_detect_batch_size = 16
    # One-off step that generated the pickled detections loaded below; as written
    # it referenced an undefined name `avatar`, so it is kept commented out:
    # st.session_state.avatar.create_face_detection_results(st.session_state.avatar.video_full_frames, True)
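    # Face detections for all four avatars were precomputed and pickled; load
    # them all up front so switching avatars later needs no re-detection.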
current_status_placeholder.write("load face detection result")
st.session_state.face_det_results_dict={}
for option in options:
with open(f'ref_videos/{option}_face_det_result.pkl', 'rb') as file:
st.session_state.face_det_results_dict[option] = pickle.load(file)
st.session_state.avatar.face_detect_img_results =st.session_state.face_det_results_dict[st.session_state.selected_option]
st.session_state.avatar.face_det_results_path_and_name = 'ref_videos/Liv_face_det_result.pkl'
st.session_state.avatar.load_face_detection_results()
    # For reference, Avatar.load_face_detection_results simply unpickles
    # face_det_results_path_and_name into face_detect_img_results:
    #     with open(self.face_det_results_path_and_name, 'rb') as file:
    #         self.face_detect_img_results = pickle.load(file)
input_text = "Hi How are you?"
st.session_state.avatar.text_to_lip_video(input_text,init_progress_bar)
current_status_placeholder.write("load face detection result done")
st.session_state['is_initialized'] = True
# Create the radio button group
selected_option = st.radio("Choose an option:", options, index=options.index(st.session_state.selected_option))
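
# Preview image for the currently selected avatar.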
img_col1, img_col2 = st.columns([1, 1])
with img_col1:
    st.image(images[options.index(selected_option)])
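
# Switching avatars swaps in that avatar's reference frames and cached face
# detections; the Wav2Lip model itself stays loaded.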
if st.session_state.selected_option != selected_option:
    print("The selected option has changed!")
    st.session_state.selected_option = selected_option
    st.session_state.avatar.ref_video_path_and_filename = f"ref_videos/{st.session_state.selected_option}.mp4"
    st.session_state.avatar.get_video_full_frames(st.session_state.avatar.ref_video_path_and_filename)
    st.session_state.avatar.face_detect_img_results = st.session_state.face_det_results_dict[st.session_state.selected_option]
# Create a text input box and store the input in a variable
user_input = st.text_input("Enter your text:")
inference_progress_bar = st.progress(0)
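
# On each submitted prompt: clean the working directories, render a new
# lip-synced clip for the text, then embed the resulting MP4.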
if user_input:
    st.session_state.avatar.dir_clean_up()
    # Display the entered text
    st.write("You entered:", user_input)
    st.session_state.avatar.export_video = True
    st.session_state.avatar.text_to_lip_video(user_input, inference_progress_bar)
    col1, col2, col3 = st.columns([1, 4, 1])
    # with col1:
    #     st.write("Column 1 content")
    with col2:
        st.video(st.session_state.avatar.output_video_path + st.session_state.avatar.output_video_name)
    # with col3:
    #     st.write("Column 3 content")