frankiek3 committed
Commit 551ad0a
1 Parent(s): 86ffc3f

Add Feature: Start Analysis at Time (s)

Files changed (1):
  1. app.py +12 -18
app.py CHANGED
@@ -12,11 +12,9 @@ os.system('pip install sentencepiece')
 os.system('pip install python_speech_features')
 os.system('pip install scikit-video')
 os.system('pip install transformers')
-os.system('pip install gradio==3.12')
+os.system('pip install gradio')
 os.system('pip install numpy==1.23.3')
 
-
-# sys.path.append('/home/user/app/av_hubert')
 sys.path.append('/home/user/app/av_hubert/avhubert')
 
 print(sys.path)
@@ -25,7 +23,6 @@ print(sys.argv, type(sys.argv))
 sys.argv.append('dummy')
 
 
-
 import dlib, cv2, os
 import numpy as np
 import skvideo
@@ -44,8 +41,6 @@ from huggingface_hub import hf_hub_download
 import gradio as gr
 from pytube import YouTube
 
-# os.chdir('/home/user/app/av_hubert/avhubert')
-
 user_dir = "/home/user/app/av_hubert/avhubert"
 utils.import_user_module(Namespace(user_dir=user_dir))
 data_dir = "/home/user/app/video"
@@ -88,7 +83,7 @@ def detect_landmark(image, detector, predictor):
         coords[i] = (shape.part(i).x, shape.part(i).y)
     return coords
 
-def preprocess_video(input_video_path):
+def preprocess_video(input_video_path, input_start):
     if torch.cuda.is_available():
         detector = dlib.cnn_face_detection_model_v1(face_detector_path)
     else:
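Note: the CPU branch of this hunk is cut off at the hunk boundary. A minimal sketch of the likely device-dependent detector selection, assuming the usual dlib pairing (the helper name build_detector is hypothetical, not from the commit):

    import dlib
    import torch

    def build_detector(face_detector_path):
        # GPU available: dlib's CNN face detector, loading MMOD weights from disk
        if torch.cuda.is_available():
            return dlib.cnn_face_detection_model_v1(face_detector_path)
        # CPU fallback (assumption): dlib's built-in HOG frontal-face detector
        return dlib.get_frontal_face_detector()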
@@ -98,7 +93,7 @@ def preprocess_video(input_video_path):
     STD_SIZE = (256, 256)
     mean_face_landmarks = np.load(mean_face_path)
     stablePntsIDs = [33, 36, 39, 42, 45]
-    videogen = skvideo.io.vread(input_video_path)
+    videogen = skvideo.io.vread(input_video_path, inputdict={'-ss': str(input_start), '-t': '10'})
     frames = np.array([frame for frame in videogen])
     landmarks = []
     for frame in tqdm(frames):
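This inputdict argument is the core of the feature: scikit-video forwards the key/value pairs to ffmpeg as input-side options, so '-ss' seeks to the requested start time and '-t' caps decoding at 10 seconds before any frames reach Python. A minimal sketch of the same call, assuming ffmpeg is on PATH (the file name and start_s value are hypothetical):

    import skvideo.io

    start_s = 4.5  # hypothetical start time in seconds
    frames = skvideo.io.vread(
        "clip.mp4",                       # hypothetical input file
        inputdict={
            "-ss": str(start_s),          # seek before decoding
            "-t": "10",                   # read at most 10 seconds
        },
    )
    print(frames.shape)  # (num_frames, height, width, 3) uint8 array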
@@ -141,6 +136,7 @@ def predict(process_video):
 # ---- Gradio Layout -----
 youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
 video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
+video_start_in = gr.Number(label="Start Time in Seconds", value=1, interactive=True)
 video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
 demo = gr.Blocks()
 demo.encrypt = False
@@ -161,7 +157,7 @@ with demo:
     with gr.Row():
         gr.Markdown('''
         ### Reading Lip movement with youtube link using Avhubert
-        ##### Step 1a. Download video from youtube (Note: the length of video should be less than 10 seconds if not it will be cut and the face should be stable for better result)
+        ##### Step 1a. Download video from youtube (Note: Only 10 seconds will be analyzed and the face should be stable for better result)
         ##### Step 1b. You also can upload video directly
         ##### Step 2. Generating landmarks surrounding mouth area
         ##### Step 3. Reading lip movement.
@@ -176,21 +172,19 @@ with demo:
                 "https://www.youtube.com/watch?v=80yqL2KzBVw"],
                 label="Examples", inputs=[youtube_url_in])
         with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [
-                video_in])
-            print(video_in)
+            youtube_url_in.render()
+            video_start_in.render()
+            download_youtube_btn = gr.Button("Download Youtube video")
+            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+            print(video_in)
     with gr.Row():
         video_in.render()
         video_out.render()
     with gr.Row():
         detect_landmark_btn = gr.Button("Detect landmark")
-        detect_landmark_btn.click(preprocess_video, [video_in], [
-            video_out])
+        detect_landmark_btn.click(preprocess_video, [video_in, video_start_in], [video_out])
         predict_btn = gr.Button("Predict")
-        predict_btn.click(predict, [video_out], [
-            text_output])
+        predict_btn.click(predict, [video_out], [text_output])
     with gr.Row():
         # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
         text_output.render()
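The button wiring is the second half of the feature: Gradio matches the inputs list positionally to the handler's parameters, so appending video_start_in passes the chosen start time as preprocess_video's new input_start argument. A minimal self-contained sketch of the same pattern (the component and handler names here are hypothetical stand-ins, not from the commit):

    import gradio as gr

    def trim_info(video_path, start_s):
        # Stand-in for preprocess_video: just report what would be analyzed.
        return f"would analyze {video_path} from {start_s}s for 10s"

    with gr.Blocks() as demo:
        video_in = gr.Video(label="Input Video")
        start_in = gr.Number(label="Start Time in Seconds", value=1)
        out = gr.Textbox(label="Result")
        btn = gr.Button("Detect landmark")
        # inputs map positionally: video_in -> video_path, start_in -> start_s
        btn.click(trim_info, [video_in, start_in], [out])

    demo.launch()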
 