frankiek3 committed
Commit 551ad0a
1 Parent(s): 86ffc3f

Add Feature: Start Analysis at Time (s)

Files changed (1):
  1. app.py +12 -18
app.py CHANGED
@@ -12,11 +12,9 @@ os.system('pip install sentencepiece')
 os.system('pip install python_speech_features')
 os.system('pip install scikit-video')
 os.system('pip install transformers')
-os.system('pip install gradio==3.12')
+os.system('pip install gradio')
 os.system('pip install numpy==1.23.3')
 
-
-# sys.path.append('/home/user/app/av_hubert')
 sys.path.append('/home/user/app/av_hubert/avhubert')
 
 print(sys.path)
@@ -25,7 +23,6 @@ print(sys.argv, type(sys.argv))
 sys.argv.append('dummy')
 
 
-
 import dlib, cv2, os
 import numpy as np
 import skvideo
@@ -44,8 +41,6 @@ from huggingface_hub import hf_hub_download
 import gradio as gr
 from pytube import YouTube
 
-# os.chdir('/home/user/app/av_hubert/avhubert')
-
 user_dir = "/home/user/app/av_hubert/avhubert"
 utils.import_user_module(Namespace(user_dir=user_dir))
 data_dir = "/home/user/app/video"
@@ -88,7 +83,7 @@ def detect_landmark(image, detector, predictor):
         coords[i] = (shape.part(i).x, shape.part(i).y)
     return coords
 
-def preprocess_video(input_video_path):
+def preprocess_video(input_video_path, input_start):
     if torch.cuda.is_available():
         detector = dlib.cnn_face_detection_model_v1(face_detector_path)
     else:
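Note: the CPU branch of this hunk is cut off at the hunk boundary. A minimal sketch of the likely device-dependent detector selection, assuming the usual dlib pairing (the helper name build_detector is hypothetical, not from the commit):

    import dlib
    import torch

    def build_detector(face_detector_path):
        # GPU available: dlib's CNN face detector, loading MMOD weights from disk
        if torch.cuda.is_available():
            return dlib.cnn_face_detection_model_v1(face_detector_path)
        # CPU fallback (assumption): dlib's built-in HOG frontal-face detector
        return dlib.get_frontal_face_detector()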
@@ -98,7 +93,7 @@ def preprocess_video(input_video_path):
     STD_SIZE = (256, 256)
     mean_face_landmarks = np.load(mean_face_path)
     stablePntsIDs = [33, 36, 39, 42, 45]
-    videogen = skvideo.io.vread(input_video_path)
+    videogen = skvideo.io.vread(input_video_path, inputdict={'-ss': str(input_start), '-t': '10'})
     frames = np.array([frame for frame in videogen])
     landmarks = []
     for frame in tqdm(frames):
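This inputdict argument is the core of the feature: scikit-video forwards the key/value pairs to ffmpeg as input-side options, so '-ss' seeks to the requested start time and '-t' caps decoding at 10 seconds before any frames reach Python. A minimal sketch of the same call, assuming ffmpeg is on PATH (the file name and start_s value are hypothetical):

    import skvideo.io

    start_s = 4.5  # hypothetical start time in seconds
    frames = skvideo.io.vread(
        "clip.mp4",                       # hypothetical input file
        inputdict={
            "-ss": str(start_s),          # seek before decoding
            "-t": "10",                   # read at most 10 seconds
        },
    )
    print(frames.shape)  # (num_frames, height, width, 3) uint8 array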
@@ -141,6 +136,7 @@ def predict(process_video):
 # ---- Gradio Layout -----
 youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
 video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
+video_start_in = gr.Number(label="Start Time in Seconds", value=1, interactive=True)
 video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
 demo = gr.Blocks()
 demo.encrypt = False
@@ -161,7 +157,7 @@ with demo:
     with gr.Row():
         gr.Markdown('''
         ### Reading Lip movement with youtube link using Avhubert
-        ##### Step 1a. Download video from youtube (Note: the length of video should be less than 10 seconds if not it will be cut and the face should be stable for better result)
+        ##### Step 1a. Download video from youtube (Note: Only 10 seconds will be analyzed and the face should be stable for better result)
         ##### Step 1b. You also can upload video directly
         ##### Step 2. Generating landmarks surrounding mouth area
         ##### Step 3. Reading lip movement.
@@ -176,21 +172,19 @@ with demo:
                 "https://www.youtube.com/watch?v=80yqL2KzBVw"],
                 label="Examples", inputs=[youtube_url_in])
         with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [
-                video_in])
-            print(video_in)
+            youtube_url_in.render()
+            video_start_in.render()
+            download_youtube_btn = gr.Button("Download Youtube video")
+            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+            print(video_in)
     with gr.Row():
         video_in.render()
         video_out.render()
     with gr.Row():
         detect_landmark_btn = gr.Button("Detect landmark")
-        detect_landmark_btn.click(preprocess_video, [video_in], [
-            video_out])
+        detect_landmark_btn.click(preprocess_video, [video_in, video_start_in], [video_out])
         predict_btn = gr.Button("Predict")
-        predict_btn.click(predict, [video_out], [
-            text_output])
+        predict_btn.click(predict, [video_out], [text_output])
     with gr.Row():
         # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
         text_output.render()
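The button wiring is the second half of the feature: Gradio matches the inputs list positionally to the handler's parameters, so appending video_start_in passes the chosen start time as preprocess_video's new input_start argument. A minimal self-contained sketch of the same pattern (the component and handler names here are hypothetical stand-ins, not from the commit):

    import gradio as gr

    def trim_info(video_path, start_s):
        # Stand-in for preprocess_video: just report what would be analyzed.
        return f"would analyze {video_path} from {start_s}s for 10s"

    with gr.Blocks() as demo:
        video_in = gr.Video(label="Input Video")
        start_in = gr.Number(label="Start Time in Seconds", value=1)
        out = gr.Textbox(label="Result")
        btn = gr.Button("Detect landmark")
        # inputs map positionally: video_in -> video_path, start_in -> start_s
        btn.click(trim_info, [video_in, start_in], [out])

    demo.launch()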
 