text_to_speech_sync_video

Running

File size: 4,915 Bytes

4d909dd
d35ea54
 
 
a8c9931
6abe9d1
 
74076e9
d35ea54
4d909dd
74076e9
 
 
 
 
 
 
 
 
f9a70bb
 
 
74076e9
 
f9a70bb
74076e9
 
 
 
 
 
 
 
 
 
 
 
 
2b8678f
9cbe47c
 
74076e9
069100e
 
 
 
2b8678f
 
7b6b779
 
 
 
2b8678f
7b6b779
78953ab
 
2b8678f
 
d35ea54
74076e9
 
2b8678f
d35ea54
 
2b8678f
 
74076e9
d35ea54
74076e9
2b8678f
 
d35ea54
74076e9
2b8678f
 
d35ea54
 
 
 
 
 
2b8678f
 
a8c9931
 
2b8678f
 
e17cc19
 
2b8678f
 
 
a8c9931
 
 
2b8678f
 
 
 
 
ae516d2
2b8678f
74076e9
2b8678f
d35ea54
 
2b8678f
a8c9931
41d2d4a
 
 
 
a8c9931
2b8678f
a8c9931
 
 
 
2b8678f
a8c9931
2b8678f

import streamlit as st
import os
import sys
import torch
import pickle
import numpy
import librosa
import subprocess
from avatar import Avatar

def run_pickleface():
    """Run ``pickleface.py`` as a child process and report whether it succeeded.

    On success the script's captured stdout is echoed with ``print``. On
    failure the problem is shown in the Streamlit UI via ``st.error``.

    Returns:
        bool: True when the script exits with status 0; False when it exits
        with a non-zero status or the interpreter could not be launched.
    """
    # Prefer the interpreter that is running this app so the child process
    # sees the same environment; fall back to whatever 'python' is on PATH.
    interpreter = sys.executable or 'python'
    try:
        result = subprocess.run(
            [interpreter, 'pickleface.py'],
            check=True,  # a non-zero exit status raises CalledProcessError
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        st.error(f"Critical error running pickleface.py: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be started (missing, not executable).
        st.error(f"Critical error running pickleface.py: {e}")
        return False

    # With check=True, reaching this point means the exit status was 0.
    print(result.stdout)
    return True

def initialize_face_detection_results():
    """Make sure a face-detection pickle exists for every entry in ``options``.

    Looks for ``ref_videos/<option>_face_det_result.pkl`` for each option. If
    at least one is absent, ``run_pickleface()`` is invoked while status text
    is written to ``current_status_placeholder``. When that run fails, an
    error is displayed and the Streamlit script is stopped.
    """
    # Nothing to do when every expected pickle file is already on disk.
    if all(os.path.exists(f'ref_videos/{name}_face_det_result.pkl') for name in options):
        return

    current_status_placeholder.write("Creating face detection results...")
    created = run_pickleface()
    if not created:
        st.error("Failed to create face detection results")
        st.stop()
    current_status_placeholder.write("Face detection results created successfully!")

# Initial configuration: selectable options and one preview image per option,
# listed in the same order.
options = ['Aude', 'Kyla', 'Liv', 'MC6']
images = [f'ref_videos/{name}.png' for name in options]

# Put the Wav2Lip directory that sits next to this script at the front of
# sys.path, unless it is already listed.
app_dir = os.path.dirname(__file__)
wav2lip_path = os.path.join(app_dir, "Wav2Lip")
if wav2lip_path not in sys.path:
    sys.path.insert(0, wav2lip_path)

# User interface: centered page title rendered as raw HTML, followed by the
# status placeholder and progress bar used during initialization.
# The heading's font size unit is 'px'; the previous '30x' is not a valid CSS
# length, so the declaration had no effect.
big_text = """
    <div style='text-align: center;'>
        <h1 style='font-size: 30px;'>Text to Speech Synchronized Video</h1>
    </div>
"""
st.markdown(big_text, unsafe_allow_html=True)
current_status_placeholder = st.empty()
init_progress_bar = st.progress(0)

# Initialize session state.
# Everything in this branch runs only while 'is_initialized' is absent from
# st.session_state; the flag is set on the last line of the branch.
if 'is_initialized' not in st.session_state:
    initialize_face_detection_results()
    
    # Create the Avatar and keep it in session state
    st.session_state.avatar = Avatar()
    st.session_state.avatar.export_video = False
    
    # Load model
    current_status_placeholder.write("Loading model...")
    st.session_state.avatar.load_model("checkpoint/wav2lip_gan.pth")
    current_status_placeholder.write("Model loaded successfully")
    
    # Device configuration: 'cuda' when torch reports CUDA as available, else 'cpu'.
    # NOTE(review): the device is assigned after load_model() has already run;
    # confirm load_model() does not depend on it.
    st.session_state.avatar.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {st.session_state.avatar.device}")
    
    # Path configuration for the audio, temporary video and final video outputs
    st.session_state.avatar.output_audio_path = "audio/"
    st.session_state.avatar.output_audio_filename = "result.wav"
    st.session_state.avatar.temp_lip_video_no_voice_path = "temp/"
    st.session_state.avatar.temp_lip_video_no_voice_filename = "result.avi"
    st.session_state.avatar.output_video_path = "results/"
    st.session_state.avatar.output_video_name = "result_voice.mp4"
    
    # Default reference video
    st.session_state.selected_option = "Liv"
    st.session_state.avatar.ref_video_path_and_filename = f"ref_videos/{st.session_state.selected_option}.mp4"
    
    # Read the reference video frames and set the face-detection batch size
    st.session_state.avatar.get_video_full_frames(st.session_state.avatar.ref_video_path_and_filename)
    st.session_state.avatar.face_detect_batch_size = 16
    
    # Load the face detection results for every option, keyed by option name.
    # NOTE(review): pickle.load can execute arbitrary code from the file it
    # reads; presumably these files are only ever produced locally by
    # pickleface.py - confirm they never come from an untrusted source.
    st.session_state.face_det_results_dict = {}
    for option in options:
        with open(f'ref_videos/{option}_face_det_result.pkl', 'rb') as file:
            st.session_state.face_det_results_dict[option] = pickle.load(file)
    
    # Point the avatar at the results (and the pickle path) of the default option
    st.session_state.avatar.face_detect_img_results = st.session_state.face_det_results_dict[st.session_state.selected_option]
    st.session_state.avatar.face_det_results_path_and_name = f'ref_videos/{st.session_state.selected_option}_face_det_result.pkl'
    
    # Text to speech: run text_to_lip_video once with a fixed sample sentence,
    # passing the initialization progress bar.
    # NOTE(review): presumably a warm-up run - confirm against Avatar.
    input_text = "Hi How are you?"
    st.session_state.avatar.text_to_lip_video(input_text, init_progress_bar)
    current_status_placeholder.write("Face detection results loaded")
    
    # Mark initialization as done so later reruns skip this branch
    st.session_state['is_initialized'] = True

# Video selection UI: the radio button is preselected with the option
# currently stored in session state.
current_index = options.index(st.session_state.selected_option)
selected_option = st.radio("Choose an option:", options, index=current_index)
img_col1, img_col2 = st.columns([1,1])

# Show the image that belongs to the chosen option in the first column.
with img_col1:
    preview_image = images[options.index(selected_option)]
    st.image(preview_image)

# Handle a change of selection: store it, then point the avatar at the new
# reference video, re-read its frames and swap in the matching face
# detection results.
# NOTE(review): face_det_results_path_and_name is not reassigned in the lines
# shown here - confirm whether it should follow the selection as well.
if selected_option != st.session_state.selected_option:
    print("The selected option has changed!")
    st.session_state.selected_option = selected_option
    active_avatar = st.session_state.avatar
    active_avatar.ref_video_path_and_filename = f"ref_videos/{st.session_state.selected_option}.mp4"

    active_avatar.get_video_full_frames(active_avatar.ref_video_path_and_filename)
    active_avatar.face_detect_img_results = st.session_state.face_det_results_dict[st.session_state.selected_option]