wissemkarous committed on
Commit
479504c
1 Parent(s): 52e768e

extraction

Browse files
lips_coords_extractor.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import dlib
3
+ import json
4
+ import glob
5
+ import os
6
+ from multiprocessing import Pool
7
+
8
# Directory that receives one JSON file of lip coordinates per video.
LIP_COORDINATES_DIRECTORY = "lip_coordinates"
# Directory holding the text log of videos that could not be processed.
ERROR_DIRECTORY = "error_videos"

# Path to the original GRID dataset whose videos are converted to frames.
GRID_IMAGES_DIRECTORY = "lip/GRID_imgs"

# Split lists. The unseen *training* list used to carry the same value as
# test_unseen_list ("data/unseen_val.txt"), so that split was never loaded.
# NOTE(review): "data/unseen_train.txt" follows the overlap_train/overlap_val
# naming of the sibling lists -- confirm the file exists under that name.
train_unseen_list = "data/unseen_train.txt"
train_overlap_list = "data/overlap_train.txt"
test_unseen_list = "data/unseen_val.txt"
test_overlap_list = "data/overlap_val.txt"
17
+
18
+
19
def load_data_list(data_path, dictionary):
    """Register every video named in a list file in ``dictionary``.

    Each non-blank line of ``data_path`` is split on "/"; the fourth
    component from the end is taken as the speaker and the last component
    as the video id. The key ``"<speaker>/<video id>"`` is set to ``1``.

    Args:
        data_path: Path of the text file to read, one entry per line.
        dictionary: Mapping that is updated in place.

    Returns:
        The same ``dictionary`` object that was passed in.

    Raises:
        IndexError: If a non-blank line has fewer than four "/"-separated
            components.
    """
    with open(data_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no entry; indexing them would fail.
                continue
            parts = line.split("/")
            speaker = parts[-4]
            vid = parts[-1]
            dictionary[f"{speaker}/{vid}"] = 1
    return dictionary
27
+
28
+
29
def extract_lip_coordinates(detector, predictor, img_path):
    """Return the lip landmark coordinates found in a single image frame.

    Used to preprocess the original image frames in the GRID dataset. The
    image is read, resized to 600x500, converted to grayscale and handed to
    ``detector``; ``predictor`` is then applied to the single detection and
    its parts 48..67 are collected.

    Args:
        detector: Callable taking the grayscale image and returning the
            detected rectangles.
        predictor: Callable taking ``(gray_image, rect)`` and returning an
            object whose ``part(n)`` exposes ``.x`` and ``.y``.
        img_path: Path of the image file to process.

    Returns:
        ``[x, y]`` -- two parallel lists with the x and y coordinates of
        parts 48..67 (20 entries each), in resized-image pixels.

    Raises:
        ValueError: If the image cannot be read, or if the detector does not
            return exactly one rectangle. (An explicit raise replaces the
            former ``assert``, which disappears under ``python -O``.)
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise ValueError(f"could not read image: {img_path}")
    image = cv2.resize(image, (600, 500))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    rects = detector(gray)
    if len(rects) != 1:
        raise ValueError(
            f"expected exactly one face in {img_path}, found {len(rects)}"
        )

    # apply the shape predictor to the face ROI
    shape = predictor(gray, rects[0])
    points = [shape.part(n) for n in range(48, 68)]
    x = [point.x for point in points]
    y = [point.y for point in points]
    return [x, y]
52
+
53
+
54
def log_error_video(video_path):
    """Report a failed video on stdout and append it to the error log.

    Args:
        video_path: Path of the video that failed; written as one line to
            ``ERROR_DIRECTORY + "/error_videos.txt"``.
    """
    print("Error: ", video_path)
    # Nothing visible in this script creates the log directory, and this
    # function runs from inside an exception handler, so create it on demand
    # instead of failing with FileNotFoundError.
    os.makedirs(ERROR_DIRECTORY, exist_ok=True)
    with open(ERROR_DIRECTORY + "/error_videos.txt", "a") as f:
        f.write(video_path + "\n")
58
+
59
+
60
# Collect the "speaker/video" keys named in the four split lists.
# NOTE(review): no active code visible in this file reads data_dict.
data_dict = {}
data_dict = load_data_list(train_unseen_list, data_dict)
data_dict = load_data_list(train_overlap_list, data_dict)
data_dict = load_data_list(test_unseen_list, data_dict)
data_dict = load_data_list(test_overlap_list, data_dict)


# Every entry directly under the dataset root is treated as one speaker.
speakers = glob.glob(GRID_IMAGES_DIRECTORY + "/*")
if speakers:
    print(speakers[0])
else:
    # An empty or missing dataset directory used to crash here with IndexError.
    print("No speaker directories found under", GRID_IMAGES_DIRECTORY)
69
+
70
+
71
def _path_stem(path):
    """Return the last path component of ``path`` up to its first "."."""
    # os.path.basename follows the platform's separators, unlike the former
    # hard-coded "\\" split, which only worked on Windows.
    return os.path.basename(path).split(".")[0]


def _process_video(detector, predictor, video):
    """Extract lip coordinates for every frame of one video and save them."""
    frames = glob.glob(video + "/*.jpg")
    if len(frames) < 50:  # filter out bad videos
        return

    vid = {}
    try:
        # Frame files are expected to be named by their integer index.
        frames = sorted(frames, key=lambda path: int(_path_stem(path)))

        coords = None
        for frame in frames:
            frame_coords = None
            for _ in range(3):
                try:
                    frame_coords = extract_lip_coordinates(
                        detector, predictor, frame
                    )
                    break
                except Exception as e:
                    print("Error: ", video)
                    print(e)
                    print("retrying...")

            if frame_coords is not None:
                coords = frame_coords
            elif coords is None:
                # No earlier frame to fall back on: fail the whole video so
                # it ends up in the error log.
                raise RuntimeError(f"no lip coordinates for frame {frame}")
            else:
                # Same outcome as before (the previous frame's coordinates
                # are stored for a frame that keeps failing), but now
                # explicit and reported instead of a silent side effect.
                print("Reusing previous frame coordinates for", frame)

            vid[_path_stem(frame)] = coords

        normalized = os.path.normpath(video)
        speaker_name = os.path.basename(os.path.dirname(normalized))
        video_name = os.path.basename(normalized)
        output_dir = LIP_COORDINATES_DIRECTORY + "/" + speaker_name
        save_path = output_dir + "/" + video_name + ".json"

        os.makedirs(output_dir, exist_ok=True)
        with open(save_path, "w") as f:
            json.dump(vid, f)
    except Exception as e:
        print(e)
        log_error_video(video)


def generate_lip_coordinates(speakers):
    """Write a lip-coordinate JSON file for every video of ``speakers``.

    For each speaker directory, every sub-directory is treated as a video
    made of ``*.jpg`` frames. Videos with fewer than 50 frames are skipped.
    For the others, a mapping from frame name to the result of
    ``extract_lip_coordinates`` is written to
    ``LIP_COORDINATES_DIRECTORY/<speaker>/<video>.json``. A video that raises
    during processing is passed to ``log_error_video`` and the loop moves on.

    This single definition replaces two identical copies of the function;
    the second copy shadowed the first. All videos are processed; no
    filtering against ``data_dict`` is applied.

    Args:
        speakers: Iterable of speaker directory paths.
    """
    detector = dlib.get_frontal_face_detector()
    # NOTE(review): the model path is relative to the working directory --
    # confirm it matches where the .dat file is actually stored.
    predictor = dlib.shape_predictor(
        "lip_coordinate_extraction/shape_predictor_68_face_landmarks_GTX.dat"
    )
    for speaker in speakers:
        print(speaker)
        for video in glob.glob(speaker + "/*"):
            print(video)
            _process_video(detector, predictor, video)
186
+
187
+
188
num_processes = 8

# Split the speakers into num_processes contiguous groups so that every
# speaker lands in exactly one group. The earlier code special-cased group
# index 4 as "the last one" while using 8 processes, so that group took the
# whole tail and the following groups processed the same speakers again.
# The first `_extra_speakers` groups take one additional speaker each to
# absorb the remainder.
speaker_interval, _extra_speakers = divmod(len(speakers), num_processes)
speaker_groups = []
_group_start = 0
for i in range(num_processes):
    _group_end = _group_start + speaker_interval + (1 if i < _extra_speakers else 0)
    speaker_groups.append(speakers[_group_start:_group_end])
    _group_start = _group_end


if __name__ == "__main__":
    # One worker per group; each worker handles its speakers sequentially.
    with Pool(num_processes) as p:
        p.map(generate_lip_coordinates, speaker_groups)
shape_predictor_68_face_landmarks_GTX.dat ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:249a69a1d5f2d7c714a92934d35367d46eb52dc308d46717e82d49e8386b3b80
3
+ size 66435981