Spaces:
Sleeping
Sleeping
| from enum import Enum | |
| import numpy as np | |
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| from transformers import DPTImageProcessor, DPTForDepthEstimation | |
| from typing import List, Tuple | |
| import random | |
| from PIL import ImageDraw, ImageFont | |
| from gradio.components import Image as grImage | |
| import mediapipe as mp | |
# Depth estimation: the image processor and the model are both loaded from the
# "Intel/dpt-large" checkpoint once, at import time, and reused for every call.
processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

# Face detection: a single MediaPipe detector instance shared by all calls.
detector = mp.solutions.face_detection.FaceDetection(
    model_selection=1,
    min_detection_confidence=0.5,
)
class Placement(Enum):
    """
    How a mask is anchored relative to the reference coordinate it is given
    """

    # The mask is centred on the reference coordinate.
    CENTER = 0
    # The mask is horizontally centred with its bottom edge on the coordinate.
    TOP = 1
class FaceKeypointsLabel(Enum):
    """
    Semantic label attached to a face keypoint
    """

    # Any keypoint that is not the nose.
    OTHER = 0
    # The keypoint used to position nose masks and to sample depth.
    NOSE = 1
class Keypoints:
    """
    A single labelled face keypoint expressed in normalized image coordinates
    """

    def __init__(self, x: float, y: float, label: "FaceKeypointsLabel"):
        """
        :param x: horizontal position of the keypoint, normalized between 0 and 1
        :param y: vertical position of the keypoint, normalized between 0 and 1
        :param label: which facial feature this keypoint represents
        """
        self.x, self.y = x, y
        self.label = label
class BoundingBox:
    """
    An axis-aligned rectangle described by its top-left corner and its size
    """

    def __init__(self, x_min: int, y_min: int, width: int, height: int):
        """
        :param x_min: x coordinate of the left edge
        :param y_min: y coordinate of the top edge
        :param width: horizontal extent of the box
        :param height: vertical extent of the box
        """
        self.x_min, self.y_min = x_min, y_min
        self.width, self.height = width, height
class FaceDetectionResult:
    """
    One detected face: its bounding box together with its keypoints
    """

    def __init__(self, bounding_box: "BoundingBox", keypoints: "List[Keypoints]"):
        """
        :param bounding_box: rectangle enclosing the detected face
        :param keypoints: labelled keypoints found for this face
        """
        self.keypoints = keypoints
        self.bounding_box = bounding_box
def detect_face(image: "Image.Image") -> list:
    """
    Use mediapipe to detect faces in an image

    :param image: PIL image in any mode; it is converted to RGB before detection
    :return: the mediapipe detections, or an empty list when no face is found
    """
    # MediaPipe only accepts three-channel RGB arrays; converting here keeps
    # RGBA, greyscale and palette images from being rejected by the detector.
    rgb_pixels = np.array(image.convert("RGB"))
    result = detector.process(rgb_pixels)
    # `detections` is None (not an empty sequence) when nothing was found.
    if not result.detections:
        return []
    return list(result.detections)
def predict_depth(image: "Image.Image") -> np.ndarray:
    """
    Predict depth for an image

    :param image: PIL image to run the depth model on
    :return: uint8 array of shape (image.height, image.width) holding the model
        output rescaled so that its maximum maps to 255
    """
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        predicted_depth = model(**inputs).predicted_depth
        # Interpolate to original size; PIL's size is (width, height) while
        # interpolate expects (height, width), hence the reversal.
        resized = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=image.size[::-1],
            mode="bicubic",
            align_corners=False,
        )
    # Index the batch/channel axes explicitly: a bare squeeze() would also
    # drop a height or width of 1 and return a 1-D array.
    depth = resized[0, 0].cpu().numpy()
    peak = float(np.max(depth))
    if peak <= 0:
        # Nothing to scale by; avoid a division by zero / NaN cast.
        return np.zeros(depth.shape, dtype="uint8")
    # Bicubic resampling can overshoot below zero; clip before the cast so
    # negative values do not wrap around to large uint8 values.
    return np.clip(depth * 255 / peak, 0, 255).astype("uint8")
def estimate_depth_at_points(depth_map: np.ndarray, coordinates: List[Tuple[int, int]]) -> List[float]:
    """
    Get the depth at the given coordinates

    Coordinates that fall outside the depth map are clamped to the nearest
    edge pixel, so a point slightly outside the image neither raises nor
    wraps around to the opposite side.

    :param depth_map: 2-D array indexed as [row (y), column (x)]
    :param coordinates: (x, y) pixel positions to sample
    :return: one depth value per coordinate, in the same order
    """
    max_y = depth_map.shape[0] - 1
    max_x = depth_map.shape[1] - 1
    depth_estimates = []
    for x, y in coordinates:
        # Negative indices would otherwise silently read from the far edge.
        column = min(max(int(x), 0), max_x)
        row = min(max(int(y), 0), max_y)
        depth_estimates.append(float(depth_map[row, column]))
    return depth_estimates
class Person:
    """
    Geometry of one person's head used to position masks on an image
    """

    def __init__(self, nose_x: int, nose_y: int, head_width: int, head_height: int, middle_top_head_x: int, middle_top_head_y: int):
        """
        :param nose_x: x coordinate of the nose
        :param nose_y: y coordinate of the nose
        :param head_width: width of the head
        :param head_height: height of the head
        :param middle_top_head_x: x coordinate of the middle of the top of the head
        :param middle_top_head_y: y coordinate of the middle of the top of the head
        """
        self.nose_x, self.nose_y = nose_x, nose_y
        self.head_width, self.head_height = head_width, head_height
        self.middle_top_head_x, self.middle_top_head_y = middle_top_head_x, middle_top_head_y
        # The nose size is derived from the head size: a fifth of the width
        # and a third of the height, truncated to whole numbers.
        self.nose_width = int(self.head_width / 5)
        self.nose_height = int(self.head_height / 3)
def extract_persons(face_detection_results: "List[FaceDetectionResult]", image: "Image.Image") -> "List[Person]":
    """
    Extract a list of people from a face detection result

    :param face_detection_results: parsed detections, one per face
    :param image: image the detections belong to; its width and height are
        used to turn normalized keypoint coordinates into pixels
    :return: one Person per detection, in the same order
    """
    persons = []
    for face_result in face_detection_results:
        bbox = face_result.bounding_box
        # Look the nose up by label. Resolving it per face (with a None
        # default) prevents a face without a nose keypoint from raising a
        # NameError or silently reusing the previous face's nose.
        nose_keypoint = next(
            (kp for kp in face_result.keypoints if kp.label == FaceKeypointsLabel.NOSE),
            None,
        )
        if nose_keypoint is not None:
            nose_x = int(nose_keypoint.x * image.width)
            nose_y = int(nose_keypoint.y * image.height)
        else:
            # No nose keypoint: fall back to the centre of the bounding box.
            nose_x = int(bbox.x_min + bbox.width // 2)
            nose_y = int(bbox.y_min + bbox.height // 2)
        # Bounding box details
        middle_top_head_x = int(bbox.x_min + bbox.width // 2)
        middle_top_head_y = bbox.y_min
        persons.append(
            Person(nose_x, nose_y, bbox.width, bbox.height, middle_top_head_x, middle_top_head_y)
        )
    return persons
def add_mask(image: "Image.Image", mask: "Image.Image", coordinate: Tuple[int, int], size: "Tuple[int, ...]", placement: Placement) -> "Image.Image":
    """
    Add a mask (a static image) to an image

    :param image: image to draw on; it is modified in place and returned
    :param mask: overlay image; it is also passed as the paste transparency
        mask (assumes it carries an alpha channel -- TODO confirm for new assets)
    :param coordinate: (x, y) reference point on the image
    :param size: (width, height) for the resized mask, or a 1-tuple (width,)
        to derive the height from the mask's aspect ratio
    :param placement: CENTER centres the mask on the coordinate; TOP centres
        it horizontally with its bottom edge on the coordinate; any other
        value uses the coordinate as the top-left corner
    :return: the same image object with the mask pasted on
    """
    # maintain aspect ratio
    if len(size) == 1:
        ratio = mask.height / mask.width
        size = (size[0], int(size[0] * ratio))
    # Very small faces can round a dimension down to 0, which resize rejects;
    # keep every dimension at least one pixel.
    size = (max(1, int(size[0])), max(1, int(size[1])))
    if placement == Placement.CENTER:
        coordinate = (coordinate[0] - size[0] // 2, coordinate[1] - size[1] // 2)
    elif placement == Placement.TOP:
        coordinate = (coordinate[0] - size[0] // 2, coordinate[1] - size[1])
    mask = mask.resize(size)
    image.paste(mask, coordinate, mask)
    return image
def draw_attributes(image: Image, persons: List[Person]) -> Image:
    """
    Debug helper that draws the detected face attributes on an image

    For every person it marks the nose, outlines the head rectangle, marks the
    middle of the top of the head and writes the head size and nose position.
    The image is modified in place and returned.
    """
    canvas = ImageDraw.Draw(image)
    default_font = ImageFont.load_default()
    green, red, white = (0, 255, 0), (255, 0, 0), (255, 255, 255)
    for person in persons:
        nose_x, nose_y = person.nose_x, person.nose_y
        top_x, top_y = person.middle_top_head_x, person.middle_top_head_y
        half_width = person.head_width // 2

        # Nose marker
        canvas.ellipse([(nose_x - 5, nose_y - 5), (nose_x + 5, nose_y + 5)], fill=green)
        # Head rectangle, hanging down from the middle-top point
        canvas.rectangle(
            [(top_x - half_width, top_y), (top_x + half_width, top_y + person.head_height)],
            outline=green,
        )
        # Head dimensions, written just above the rectangle
        canvas.text(
            (top_x, top_y - 20),
            f"Width: {person.head_width}, Height: {person.head_height}",
            fill=white,
            font=default_font,
        )
        # Nose position, written just below the nose marker
        canvas.text((nose_x, nose_y + 10), f"({nose_x}, {nose_y})", fill=white, font=default_font)
        # Middle-top-of-head marker
        canvas.ellipse([(top_x - 5, top_y - 5), (top_x + 5, top_y + 5)], fill=red)
    return image
def apply_reindeer_mask(image: Image, person: Person) -> Image:
    """
    Give a person a reindeer nose and antlers

    The nose overlay is a square whose side is the person's nose height,
    centred on the nose. The antlers are as wide as the head (height follows
    the asset's aspect ratio) and sit on the middle of the top of the head.
    """
    # Nose first, then antlers -- same paste order as always.
    nose_overlay = Image.open("mask/reindeer_nose.png")
    antlers_overlay = Image.open("mask/reindeer_antlers.png")

    nose_side = person.nose_height
    image = add_mask(
        image,
        nose_overlay,
        (person.nose_x, person.nose_y),
        (nose_side, nose_side),
        Placement.CENTER,
    )
    image = add_mask(
        image,
        antlers_overlay,
        (person.middle_top_head_x, person.middle_top_head_y),
        (person.head_width, ),
        Placement.TOP,
    )
    return image
def apply_santa_hat_mask(image: Image, person: Person) -> Image:
    """
    Put a santa hat on a person

    The hat is as wide as the head (height follows the asset's aspect ratio)
    and sits on the middle of the top of the head.
    """
    hat_overlay = Image.open("mask/santa_hat.png")
    top_of_head = (person.middle_top_head_x, person.middle_top_head_y)
    # A 1-tuple size asks add_mask to keep the overlay's aspect ratio.
    return add_mask(image, hat_overlay, top_of_head, (person.head_width, ), Placement.TOP)
def add_text(image: "Image.Image", text: str, font_size: int = 30) -> "Image.Image":
    """
    Add red text to the middle of an image

    :param image: image to draw on; it is modified in place and returned
    :param text: text to write
    :param font_size: requested font size; honoured when the installed Pillow
        can load its default font at a given size, ignored otherwise
    :return: the same image object with the text drawn on it
    """
    draw = ImageDraw.Draw(image)
    try:
        font = ImageFont.load_default(size=font_size)
    except (TypeError, ImportError, OSError):
        # Older Pillow (no `size` argument) or a build without FreeType:
        # keep the previous behaviour of drawing the small default font with
        # its top-left corner at the image centre.
        draw.text((image.width // 2, image.height // 2), text, fill=(255, 0, 0))
        return image
    # Measure the rendered text so it can be centred instead of starting at
    # the centre and running off the right-hand side.
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    text_x = max(0, (image.width - (right - left)) // 2 - left)
    text_y = max(0, (image.height - (bottom - top)) // 2 - top)
    draw.text((text_x, text_y), text, fill=(255, 0, 0), font=font)
    return image
def apply_random_mask(image: Image, person: Person) -> Image:
    """
    Decorate a person with either the santa hat or the reindeer mask, picked at random
    """
    candidates = [apply_santa_hat_mask, apply_reindeer_mask]
    chosen_mask = random.choice(candidates)
    return chosen_mask(image, person)
def process_image(image: "Image.Image"):
    """
    The full pipeline that takes an image and returns an image with more christmas spirit :)

    No face: a notice is written on the image. One face: it gets a random
    mask. Several faces: the person closest to the camera gets the santa hat,
    the furthest gets the reindeer mask and everyone else gets a random mask.

    :param image: PIL image to decorate; it is modified in place
    :return: the decorated image
    """
    detections = detect_face(image)
    face_detection_results = parse_detection_result(detections, image)
    persons = extract_persons(face_detection_results, image)
    if not persons:
        return add_text(image, "No faces detected in the image")
    if len(persons) == 1:
        return apply_random_mask(image, persons[0])

    # Depth is only needed to rank several people, so the (expensive) depth
    # model is run only on this path, and before any mask is drawn.
    depth_result = predict_depth(image)
    depth_estimates = estimate_depth_at_points(
        depth_result, [(person.nose_x, person.nose_y) for person in persons]
    )
    # DPT is a MiDaS-style model that predicts relative *inverse* depth:
    # larger values are nearer to the camera. The closest person is therefore
    # the maximum and the furthest the minimum.
    closest_camera_index = int(np.argmax(depth_estimates))
    furthest_camera_index = int(np.argmin(depth_estimates))
    if furthest_camera_index == closest_camera_index:
        # Every estimate is identical; pick a different person as "furthest"
        # so that nobody receives both masks.
        furthest_camera_index = (closest_camera_index + 1) % len(persons)

    # Apply the rules of the assignment, closest person gets santa hat, furthest person gets reindeer mask
    image = apply_reindeer_mask(image, persons[furthest_camera_index])
    image = apply_santa_hat_mask(image, persons[closest_camera_index])
    # All other people get a random mask (either santa hat or reindeer mask) (as this was not specified in the assignment)
    for i, person in enumerate(persons):
        if i not in (closest_camera_index, furthest_camera_index):
            image = apply_random_mask(image, person)
    return image
def parse_detection_to_face_detection_result(detection, image_width: int, image_height: int) -> FaceDetectionResult:
    """
    Convert one mediapipe detection into a FaceDetectionResult

    The relative bounding box is scaled by the image size and truncated to
    integers; keypoints keep their relative coordinates.
    """
    location = detection.location_data

    # Bounding box, scaled by the image dimensions
    relative_box = location.relative_bounding_box
    bounding_box = BoundingBox(
        int(relative_box.xmin * image_width),
        int(relative_box.ymin * image_height),
        int(relative_box.width * image_width),
        int(relative_box.height * image_height),
    )

    # Keypoints: the one at position 2 is labelled as the nose (presumably
    # mediapipe's nose-tip slot -- confirm against its keypoint order), every
    # other position is labelled OTHER.
    nose_position = 2
    keypoints = [
        Keypoints(
            point.x,
            point.y,
            FaceKeypointsLabel.NOSE if position == nose_position else FaceKeypointsLabel.OTHER,
        )
        for position, point in enumerate(location.relative_keypoints)
    ]
    return FaceDetectionResult(bounding_box, keypoints)
def parse_detection_result(detection_result, image: Image) -> List[FaceDetectionResult]:
    """
    Convert every mediapipe detection into a FaceDetectionResult, preserving order
    """
    width, height = image.width, image.height
    return [
        parse_detection_to_face_detection_result(single_detection, width, height)
        for single_detection in detection_result
    ]
def main():
    """
    Build the gradio interface around process_image and launch it
    """
    # Remarks: the code is in one file for simplicity, but it would be better to split it up in multiple files
    interface_options = dict(
        fn=process_image,
        inputs=grImage(type="pil"),
        outputs=grImage(type="pil"),
        title="Image Processor",
        description="Upload an image to detect faces and apply transformations.",
    )
    # Create the interface and start serving it
    gr.Interface(**interface_options).launch()


if __name__ == "__main__":
    main()