yubo0306 committed on
Commit
6a82db8
·
verified ·
1 Parent(s): aabc730

Upload feature extractor

Browse files
feature_extraction_avhubert.py CHANGED
@@ -1,16 +1,28 @@
 
 
1
  import cv2
2
  import librosa
3
  import mediapipe as mp
4
  import numpy as np
 
5
  import torch
6
  import torch.nn.functional as F
7
  import torchvision.transforms.v2 as transforms
8
  from numpy.typing import NDArray
 
9
  from python_speech_features import logfbank
10
  from transformers import FeatureExtractionMixin
11
  from transformers.feature_extraction_utils import BatchFeature
12
 
13
- mp_face_mesh = mp.solutions.face_mesh
 
 
 
 
 
 
 
 
14
 
15
 
16
  class AVHubertFeatureExtractor(FeatureExtractionMixin):
@@ -72,13 +84,67 @@ class AVHubertFeatureExtractor(FeatureExtractionMixin):
72
  frames_np = np.stack([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames_np], axis=0)
73
 
74
  if extract_mouth:
75
- frames_np = self._extract_mouth(frames_np)
76
 
77
  return torch.from_numpy(frames_np).unsqueeze(dim=1)
78
 
79
  def _extract_mouth(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
80
  mouth_frames = []
81
  top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  with mp_face_mesh.FaceMesh(
83
  static_image_mode=self.static_image_mode,
84
  max_num_faces=1,
 
1
+ from pathlib import Path
2
+
3
  import cv2
4
  import librosa
5
  import mediapipe as mp
6
  import numpy as np
7
+ import requests
8
  import torch
9
  import torch.nn.functional as F
10
  import torchvision.transforms.v2 as transforms
11
  from numpy.typing import NDArray
12
+ from packaging.version import Version
13
  from python_speech_features import logfbank
14
  from transformers import FeatureExtractionMixin
15
  from transformers.feature_extraction_utils import BatchFeature
16
 
17
+ use_legacy_mp = False
18
+ if Version(mp.__version__) <= Version("0.10.21"):
19
+ mp_face_mesh = mp.solutions.face_mesh
20
+ use_legacy_mp = True
21
+ else:
22
+ BaseOptions = mp.tasks.BaseOptions
23
+ FaceLandmarker = mp.tasks.vision.FaceLandmarker
24
+ FaceLandmarkerOptions = mp.tasks.vision.FaceLandmarkerOptions
25
+ VisionRunningMode = mp.tasks.vision.RunningMode
26
 
27
 
28
  class AVHubertFeatureExtractor(FeatureExtractionMixin):
 
84
  frames_np = np.stack([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames_np], axis=0)
85
 
86
  if extract_mouth:
87
+ frames_np = self._extract_mouth_legacy(frames_np) if use_legacy_mp else self._extract_mouth(frames_np)
88
 
89
  return torch.from_numpy(frames_np).unsqueeze(dim=1)
90
 
91
  def _extract_mouth(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
92
  mouth_frames = []
93
  top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices
94
+
95
+ model_path = Path.home() / ".cache" / "reazonspeech" / "mediapipe---models--face_landmarker.task"
96
+ model_path.parent.mkdir(parents=True, exist_ok=True)
97
+ if not model_path.exists():
98
+ with open(model_path, "wb") as f:
99
+ f.write(requests.get("https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/latest/face_landmarker.task").content)
100
+ with FaceLandmarker.create_from_options(
101
+ FaceLandmarkerOptions(
102
+ base_options=BaseOptions(model_asset_path=model_path.as_posix()),
103
+ running_mode=VisionRunningMode.IMAGE,
104
+ num_faces=1,
105
+ min_face_detection_confidence=self.min_detection_confidence,
106
+ min_tracking_confidence=self.min_tracking_confidence,
107
+ )
108
+ ) as face_mesh:
109
+ for frame in frames:
110
+ res = face_mesh.detect(
111
+ mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
112
+ )
113
+ if res.face_landmarks is None or len(res.face_landmarks) == 0:
114
+ mouth_frames.append(np.zeros([self.image_crop_size, self.image_crop_size], dtype=np.uint8))
115
+ continue
116
+
117
+ landmarks = res.face_landmarks[0]
118
+ top = landmarks[top_idx]
119
+ left = landmarks[left_idx]
120
+ right = landmarks[right_idx]
121
+ bottom = landmarks[bottom_idx]
122
+
123
+ H, W = frame.shape[:2]
124
+ xmax = max(top.x, left.x, right.x, bottom.x)
125
+ ymax = max(top.y, left.y, right.y, bottom.y)
126
+ xmin = min(top.x, left.x, right.x, bottom.x)
127
+ ymin = min(top.y, left.y, right.y, bottom.y)
128
+
129
+ patch_size = max((xmax - xmin) * W, (ymax - ymin) * H) # To extract square region
130
+ half = int(patch_size / 2)
131
+ y_center = int(ymin * H) + int(((ymax - ymin) / 2) * H)
132
+ x_center = int(xmin * W) + int(((xmax - xmin) / 2) * W)
133
+ lip = frame[
134
+ y_center - half : y_center + half,
135
+ x_center - half : x_center + half,
136
+ :,
137
+ ]
138
+ try:
139
+ lip = cv2.resize(lip, (self.image_crop_size, self.image_crop_size))
140
+ except Exception:
141
+ lip = np.zeros([self.image_crop_size, self.image_crop_size, 3], dtype=np.uint8)
142
+ mouth_frames.append(cv2.cvtColor(lip, cv2.COLOR_RGB2GRAY))
143
+ return np.stack(mouth_frames, axis=0)
144
+
145
+ def _extract_mouth_legacy(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
146
+ mouth_frames = []
147
+ top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices
148
  with mp_face_mesh.FaceMesh(
149
  static_image_mode=self.static_image_mode,
150
  max_num_faces=1,
preprocessor_config.json CHANGED
@@ -1,7 +1,6 @@
1
  {
2
  "auto_map": {
3
- "AutoFeatureExtractor": "feature_extraction_avhubert.AVHubertFeatureExtractor",
4
- "AutoProcessor": "processing_avhubert.AVHubertProcessor"
5
  },
6
  "feature_extractor_type": "AVHubertFeatureExtractor",
7
  "image_crop_size": 88,
@@ -15,7 +14,6 @@
15
  "min_detection_confidence": 0.5,
16
  "min_tracking_confidence": 0.5,
17
  "normalize": true,
18
- "processor_class": "AVHubertProcessor",
19
  "refine_landmarks": false,
20
  "sr": 16000,
21
  "stack_order_audio": 4,
 
1
  {
2
  "auto_map": {
3
+ "AutoFeatureExtractor": "feature_extraction_avhubert.AVHubertFeatureExtractor"
 
4
  },
5
  "feature_extractor_type": "AVHubertFeatureExtractor",
6
  "image_crop_size": 88,
 
14
  "min_detection_confidence": 0.5,
15
  "min_tracking_confidence": 0.5,
16
  "normalize": true,
 
17
  "refine_landmarks": false,
18
  "sr": 16000,
19
  "stack_order_audio": 4,