Hrithik28 committed
Commit 45c1c28
Parent(s): e9783bd

Upload 7 files

.gitattributes CHANGED
@@ -1,35 +1,38 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text
+ demo1.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo2.mp4 filter=lfs diff=lfs merge=lfs -text
+ lipreading.gif filter=lfs diff=lfs merge=lfs -text
20words_mean_face.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dbf68b2044171e1160716df7c53e8bbfaa0ee8c61fb41171d04cb6092bb81422
+ size 1168
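
(The three lines above are the Git LFS pointer that is actually checked in for 20words_mean_face.npy: the 1,168-byte array payload itself lives in LFS storage, per the *.npy rule in .gitattributes.)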
README.md CHANGED
@@ -1,13 +1,14 @@
- ---
- title: Lip Moment Reading
- emoji: 👁
- colorFrom: yellow
- colorTo: red
- sdk: gradio
- sdk_version: 4.41.0
- app_file: app.py
- pinned: false
- license: unknown
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Speech Recognition from visual lip movement
+ emoji: 🫧
+ colorFrom: indigo
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 4.39.0
+ app_file: app.py
+ pinned: false
+ tags:
+ - making-demos
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
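
(For context: this YAML front matter is the Hugging Face Spaces configuration — sdk and sdk_version select the Gradio runtime the Space is built with, app_file names the script the Space executes, and tags affects how the Space is listed on the Hub.)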
app.py ADDED
@@ -0,0 +1,237 @@
+ import os
+ import sys
+ import json
+
+
+ os.system('git clone https://github.com/facebookresearch/av_hubert.git')
+ os.chdir('/home/user/app/av_hubert')
+ os.system('git submodule init')
+ os.system('git submodule update')
+ os.chdir('/home/user/app/av_hubert/fairseq')
+ os.system('pip install ./')
+ os.system('pip install scipy')
+ os.system('pip install sentencepiece')
+ os.system('pip install python_speech_features')
+ os.system('pip install scikit-video')
+ os.system('pip install transformers')
+ os.system('pip install gradio==3.12')
+ os.system('pip install numpy==1.23.3')
+
+
+ # sys.path.append('/home/user/app/av_hubert')
+ sys.path.append('/home/user/app/av_hubert/avhubert')
+
+ print(sys.path)
+ print(os.listdir())
+ print(sys.argv, type(sys.argv))
+ sys.argv.append('dummy')
+
+
+
+ import dlib, cv2, os
+ import numpy as np
+ import skvideo
+ import skvideo.io
+ from tqdm import tqdm
+ from preparation.align_mouth import landmarks_interpolate, crop_patch, write_video_ffmpeg
+ from base64 import b64encode
+ import torch
+ import cv2
+ import tempfile
+ from argparse import Namespace
+ import fairseq
+ from fairseq import checkpoint_utils, options, tasks, utils
+ from fairseq.dataclass.configs import GenerationConfig
+ from huggingface_hub import hf_hub_download
+ import gradio as gr
+ from pytube import YouTube
+
+ # os.chdir('/home/user/app/av_hubert/avhubert')
+
+ user_dir = "/home/user/app/av_hubert/avhubert"
+ utils.import_user_module(Namespace(user_dir=user_dir))
+ data_dir = "/home/user/app/video"
+
+ ckpt_path = hf_hub_download('vumichien/AV-HuBERT', 'model.pt')
+ face_detector_path = "/home/user/app/mmod_human_face_detector.dat"
+ face_predictor_path = "/home/user/app/shape_predictor_68_face_landmarks.dat"
+ mean_face_path = "/home/user/app/20words_mean_face.npy"
+ mouth_roi_path = "/home/user/app/roi.mp4"
+ modalities = ["video"]
+ gen_subset = "test"
+ gen_cfg = GenerationConfig(beam=20)
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+ models = [model.eval().cuda() if torch.cuda.is_available() else model.eval() for model in models]
+ saved_cfg.task.modalities = modalities
+ saved_cfg.task.data = data_dir
+ saved_cfg.task.label_dir = data_dir
+ task = tasks.setup_task(saved_cfg.task)
+ generator = task.build_generator(models, gen_cfg)
+
+ def get_youtube(video_url):
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+     print("Success download video")
+     print(abs_video_path)
+     return abs_video_path
+
+ def detect_landmark(image, detector, predictor):
+     gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+     face_locations = detector(gray, 1)
+     coords = None
+     for (_, face_location) in enumerate(face_locations):
+         if torch.cuda.is_available():
+             rect = face_location.rect
+         else:
+             rect = face_location
+         shape = predictor(gray, rect)
+         coords = np.zeros((68, 2), dtype=np.int32)
+         for i in range(0, 68):
+             coords[i] = (shape.part(i).x, shape.part(i).y)
+     return coords
+
+ # def predict_and_save(process_video):
+ # num_frames = int(cv2.VideoCapture(process_video).get(cv2.CAP_PROP_FRAME_COUNT))
+
+ # tsv_cont = ["/\n", f"test-0\t{process_video}\t{None}\t{num_frames}\t{int(16_000*num_frames/25)}\n"]
+ # label_cont = ["DUMMY\n"]
+ # with open(f"{data_dir}/test.tsv", "w") as fo:
+ # fo.write("".join(tsv_cont))
+ # with open(f"{data_dir}/test.wrd", "w") as fo:
+ # fo.write("".join(label_cont))
+ # task.load_dataset(gen_subset, task_cfg=saved_cfg.task)
+
+ # def decode_fn(x):
+ # dictionary = task.target_dictionary
+ # symbols_ignore = generator.symbols_to_strip_from_output
+ # symbols_ignore.add(dictionary.pad())
+ # return task.datasets[gen_subset].label_processors[0].decode(x, symbols_ignore)
+
+ # itr = task.get_batch_iterator(dataset=task.dataset(gen_subset)).next_epoch_itr(shuffle=False)
+ # sample = next(itr)
+ # if torch.cuda.is_available():
+ # sample = utils.move_to_cuda(sample)
+ # hypos = task.inference_step(generator, models, sample)
+ # ref = decode_fn(sample['target'][0].int().cpu())
+ # hypo = hypos[0][0]['tokens'].int().cpu()
+ # hypo = decode_fn(hypo)
+
+ # # Collect timestamps and texts
+ # transcript = []
+ # for i, (start, end) in enumerate(sample['net_input']['video_lengths'], 1):
+ # start_time = float(start) / 16_000
+ # end_time = float(end) / 16_000
+ # text = hypo[i].strip()
+ # transcript.append({"timestamp": [start_time, end_time], "text": text})
+
+ # # Save transcript to a JSON file
+ # with open('speech_transcript.json', 'w') as outfile:
+ # json.dump(transcript, outfile, indent=4)
+
+ # return hypo
+
+
+ def preprocess_video(input_video_path):
+     if torch.cuda.is_available():
+         detector = dlib.cnn_face_detection_model_v1(face_detector_path)
+     else:
+         detector = dlib.get_frontal_face_detector()
+
+     predictor = dlib.shape_predictor(face_predictor_path)
+     STD_SIZE = (256, 256)
+     mean_face_landmarks = np.load(mean_face_path)
+     stablePntsIDs = [33, 36, 39, 42, 45]
+     videogen = skvideo.io.vread(input_video_path)
+     frames = np.array([frame for frame in videogen])
+     landmarks = []
+     for frame in tqdm(frames):
+         landmark = detect_landmark(frame, detector, predictor)
+         landmarks.append(landmark)
+     preprocessed_landmarks = landmarks_interpolate(landmarks)
+     rois = crop_patch(input_video_path, preprocessed_landmarks, mean_face_landmarks, stablePntsIDs, STD_SIZE,
+                       window_margin=12, start_idx=48, stop_idx=68, crop_height=96, crop_width=96)
+     write_video_ffmpeg(rois, mouth_roi_path, "/usr/bin/ffmpeg")
+     return mouth_roi_path
+
+ def predict(process_video):
+     num_frames = int(cv2.VideoCapture(process_video).get(cv2.CAP_PROP_FRAME_COUNT))
+
+     tsv_cont = ["/\n", f"test-0\t{process_video}\t{None}\t{num_frames}\t{int(16_000*num_frames/25)}\n"]
+     label_cont = ["DUMMY\n"]
+     with open(f"{data_dir}/test.tsv", "w") as fo:
+         fo.write("".join(tsv_cont))
+     with open(f"{data_dir}/test.wrd", "w") as fo:
+         fo.write("".join(label_cont))
+     task.load_dataset(gen_subset, task_cfg=saved_cfg.task)
+
+     def decode_fn(x):
+         dictionary = task.target_dictionary
+         symbols_ignore = generator.symbols_to_strip_from_output
+         symbols_ignore.add(dictionary.pad())
+         return task.datasets[gen_subset].label_processors[0].decode(x, symbols_ignore)
+
+     itr = task.get_batch_iterator(dataset=task.dataset(gen_subset)).next_epoch_itr(shuffle=False)
+     sample = next(itr)
+     if torch.cuda.is_available():
+         sample = utils.move_to_cuda(sample)
+     hypos = task.inference_step(generator, models, sample)
+     ref = decode_fn(sample['target'][0].int().cpu())
+     hypo = hypos[0][0]['tokens'].int().cpu()
+     hypo = decode_fn(hypo)
+     return hypo
+
+
+ # ---- Gradio Layout -----
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+ video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
+ video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
+ demo = gr.Blocks()
+ demo.encrypt = False
+ text_output = gr.Textbox()
+
+ with demo:
+     gr.Markdown('''
+         <div>
+         <h1 style='text-align: center'>Lip Reading Using Machine learning (Audio-Visual Hidden Unit BERT Model (AV-HuBERT))</h1>
+         </div>
+         ''')
+     with gr.Row():
+         gr.Markdown('''
+         ### Reading Lip movement with youtube link using Avhubert
+         ##### Step 1a. Download video from youtube (Note: the length of video should be less than 10 seconds if not it will be cut and the face should be stable for better result)
+         ##### Step 1b. Drag and drop videos to upload directly
+         ##### Step 2. Generating landmarks surrounding mouth area
+         ##### Step 3. Reading lip movement.
+         ''')
+     with gr.Row():
+         gr.Markdown('''
+         ### You can test by following examples:
+         ''')
+     examples = gr.Examples(examples=
+             [ "https://www.youtube.com/watch?v=ZXVDnuepW2s",
+               "https://www.youtube.com/watch?v=X8_glJn1B8o",
+               "https://www.youtube.com/watch?v=80yqL2KzBVw"],
+             label="Examples", inputs=[youtube_url_in])
+     with gr.Column():
+         youtube_url_in.render()
+         download_youtube_btn = gr.Button("Download Youtube video")
+         download_youtube_btn.click(get_youtube, [youtube_url_in], [
+             video_in])
+         print(video_in)
+     with gr.Row():
+         video_in.render()
+         video_out.render()
+     with gr.Row():
+         detect_landmark_btn = gr.Button("Detect landmark")
+         detect_landmark_btn.click(preprocess_video, [video_in], [
+             video_out])
+         predict_btn = gr.Button("Predict")
+         #predict_btn.click(predict, [video_out], [text_output])
+         predict_btn.click(predict, [video_out], [text_output])
+     with gr.Row():
+         # video_lip = gr.Video(label="Audio Visual Video", mirror_webcam=False)
+         text_output.render()
+
+
+
+ demo.launch(debug=True)
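
For readers skimming the diff: the app wires two steps to the Gradio buttons above — preprocess_video crops a mouth region-of-interest clip from the input video, and predict runs AV-HuBERT decoding on that clip. A minimal editorial sketch of driving the same flow from a script (not part of the commit; the video path is a placeholder and it assumes app.py's module-level setup has already run):

    # Editorial sketch, not in the commit: reuse app.py's pipeline without the UI.
    local_video = "/home/user/app/video/sample.mp4"  # placeholder path (assumption)
    mouth_roi = preprocess_video(local_video)        # dlib landmarks -> 96x96 mouth-crop video
    transcript = predict(mouth_roi)                  # AV-HuBERT beam-search decoding of the crop
    print(transcript)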
mmod_human_face_detector.dat ADDED
Binary file (730 kB).
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ git+https://github.com/facebookresearch/fairseq.git
+ scipy
+ sentencepiece
+ python_speech_features
+ scikit-video
+ scikit-image
+ dlib
+ opencv-python
+ pytube
+ httpx==0.24.1
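
A note for running outside the Space (editorial, not part of the commit): app.py installs several packages at runtime with os.system('pip install ...'); for a local run, the pinned set above can be installed up front instead. A minimal sketch, assuming this requirements.txt sits in the current working directory:

    # Editorial sketch: install the pinned dependencies before launching app.py.
    import subprocess, sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])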
shape_predictor_68_face_landmarks.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbdc2cb80eb9aa7a758672cbfdda32ba6300efe9b6e6c7a299ff7e736b11b92f
+ size 99693937