import os
import io
import time
import json
import math
import base64
from uuid import uuid4

from PIL import Image as PILImage

# DeepFace downloads its model weights into DEEPFACE_HOME; keep them local.
os.environ["DEEPFACE_HOME"] = "."

import pyzipper
import numpy as np
import gradio as gr
from annoy import AnnoyIndex
from deepface import DeepFace

# Annoy index over 512-dimensional Facenet512 embeddings.
index = AnnoyIndex(512, "euclidean")
index.load("face.db")

# Maps Annoy item positions back to performer ids.
ANNOY_INDEX = json.load(open("face.json"))

# Performer metadata ships in an AES-encrypted zip; the key comes from the
# VISAGE_KEY environment variable.
with pyzipper.AESZipFile('persons.zip') as zf:
    password = os.getenv("VISAGE_KEY", "").encode('ascii')
    zf.setpassword(password)
    PERFORMER_DB = json.loads(zf.read('performers.json'))


## Prediction functions

def image_search_performer(image, threshold=20.0, results=3):
    """Search for a performer in an image.

    Returns a list of performers with at least the following keys:
    - id: the performer's id
    - distance: the distance between the face in the image and the performer's face
    - confidence: a confidence score between 0 and 100
    - hits: the number of times the performer was found in our database
    """
    image_array = np.array(image)
    face = DeepFace.represent(
        img_path=image_array,
        detector_backend='retinaface',
        model_name='Facenet512',
        normalization="Facenet2018",
    )[0]['embedding']
    return search_performer(face, threshold, results)


def image_search_performers(image, threshold=20.0, results=3):
    image_array = np.array(image)
    response = []
    t = time.time()
    try:
        faces = DeepFace.represent(
            img_path=image_array,
            detector_backend='retinaface',
            model_name='Facenet512',
            normalization="Facenet2018",
        )
        # Alternative detector backends, kept for reference:
        # faces = DeepFace.represent(img_path=image_array, detector_backend='yolov8', model_name='Facenet512', normalization="Facenet2018")
        # faces = DeepFace.represent(img_path=image_array, detector_backend='mtcnn', model_name='Facenet512', normalization="Facenet2018")
    except ValueError as e:
        print(e)
        raise gr.Error("No faces found in the image")
    print(f"Time to find faces: {time.time() - t}")

    for face in faces:
        embedding = face['embedding']
        area = face['facial_area']
        confidence = face['face_confidence']

        # Crop the detected face and return it base64-encoded, so the caller
        # can see which face each list of matches belongs to.
        cimage = image.crop((area['x'], area['y'], area['x'] + area['w'], area['y'] + area['h']))
        buf = io.BytesIO()
        cimage.save(buf, format='JPEG')
        im_b64 = base64.b64encode(buf.getvalue()).decode('ascii')

        response.append({
            'image': im_b64,
            'confidence': confidence,
            'performers': search_performer(embedding, threshold, results),
        })
    return response
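
# A minimal sketch of calling the search pipeline outside Gradio. It is not
# invoked anywhere in this app, and "sample.jpg" is a hypothetical local file.
def _example_cli_search(path="sample.jpg"):
    """Hypothetical helper: print the top matches for a single-face image."""
    img = PILImage.open(path)
    for match in image_search_performer(img, threshold=20.0, results=3):
        print(match["id"], match["distance"], match["confidence"], match["hits"])
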
def vector_search_performer(vector_json, threshold=20.0, results=3):
    """Search for a performer from a vector.

    The vector should be created with DeepFace and must be 512-dimensional.
    For best results use the following settings:
    - detector_backend: retinaface
    - model: Facenet512
    - normalization: Facenet2018

    Returns a list of performers with at least the following keys:
    - id: the performer's id
    - distance: the distance between the face in the image and the performer's face
    - confidence: a confidence score between 0 and 100
    - hits: the number of times the performer was found in our database
    """
    vector = np.array(json.loads(vector_json))
    return search_performer(vector, threshold, results)


def search_performer(vector, threshold=20.0, results=3):
    threshold = threshold or 20.0
    results = results or 3

    ids, distances = index.get_nns_by_vector(
        vector, 50, search_k=100000, include_distances=True
    )

    persons = {}
    for p, distance in zip(ids, distances):
        person_id = ANNOY_INDEX[p]
        if person_id in persons:
            # Repeated hits for the same performer make the match more
            # credible: count them and nudge the distance down.
            persons[person_id]["hits"] += 1
            persons[person_id]["distance"] -= 0.5
            persons[person_id]["confidence"] = normalize_confidence_from_distance(persons[person_id]["distance"], threshold)
            continue

        persons[person_id] = {
            "id": person_id,
            "distance": round(distance, 2),
            "confidence": normalize_confidence_from_distance(distance, threshold),
            "hits": 1,
        }

        if person_id in PERFORMER_DB:
            persons[person_id].update(PERFORMER_DB.get(person_id))

    persons = sorted(persons.values(), key=lambda x: x["distance"])
    # persons = [p for p in persons if p["distance"] < threshold]
    return persons[:results]


def normalize_confidence_from_distance(distance, threshold=20.0):
    """Normalize confidence to a 0-100 scale."""
    confidence = face_distance_to_conf(distance, threshold)
    # Map [0.0, 1.0] onto [0, 100].
    return int(confidence * 100.0)


def face_distance_to_conf(face_distance, face_match_threshold=20.0):
    """Using a face distance, calculate a similarity confidence value in [0, 1]."""
    if face_distance > face_match_threshold:
        # The face is far away, so give it a low confidence that decays
        # linearly past the threshold, clamped so it never goes negative.
        linear_val = 1.0 - (face_distance / (face_match_threshold * 2.0))
        return max(linear_val, 0.0)
    else:
        # The face is close, so give it a high confidence.
        linear_val = 1.0 - (face_distance / (face_match_threshold * 2.0))
        # But adjust this value by a curve so that we don't get a linear
        # transition from close to far. We want it to be more confident
        # the closer it is.
        return linear_val + ((1.0 - linear_val) * math.pow((linear_val - 0.5) * 2, 0.2))
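
# Illustrative sanity check of the distance-to-confidence mapping above (not
# called by the app): distances well under the threshold approach 100, the
# threshold itself maps to 50, and distances past it decay toward 0.
def _example_confidence_curve():
    """Hypothetical helper: print confidence for a few sample distances."""
    for d in (5.0, 10.0, 20.0, 25.0, 40.0):
        print(d, normalize_confidence_from_distance(d, threshold=20.0))
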
def find_faces_in_sprite(image, vtt):
    vtt = base64.b64decode(vtt.replace("data:text/vtt;base64,", ""))
    sprite = PILImage.fromarray(image)

    results = []
    for i, (x, y, w, h, time_seconds) in enumerate(getVTToffsets(vtt)):
        cut_frame = sprite.crop((x, y, x + w, y + h))
        faces = DeepFace.extract_faces(
            np.asarray(cut_frame),
            detector_backend="mediapipe",
            enforce_detection=False,
            align=False,
        )
        faces = [face for face in faces if face['confidence'] > 0.6]
        if faces:
            size = faces[0]['facial_area']['w'] * faces[0]['facial_area']['h']
            data = {
                'id': str(uuid4()),
                'offset': (x, y, w, h),
                'frame': i,
                'time': time_seconds,
                'size': size,
            }
            results.append(data)

    return results


def getVTToffsets(vtt):
    """Yield (x, y, w, h, start_seconds) for every cue in a sprite VTT."""
    time_seconds = 0
    x = y = w = h = None
    for line in vtt.decode("utf-8").split("\n"):
        line = line.strip()

        if "-->" in line:
            # grab the start time, e.g. 00:00:00.000 --> 00:00:41.000
            start = line.split("-->")[0].strip().split(":")
            # convert to seconds
            time_seconds = (
                int(start[0]) * 3600 + int(start[1]) * 60 + float(start[2])
            )
            x = y = w = h = None
        elif "xywh=" in line:
            x, y, w, h = line.split("xywh=")[-1].split(",")
            x, y, w, h = int(x), int(y), int(w), int(h)
        else:
            continue

        # Only yield once the cue has a region; compare against None rather
        # than truthiness so tiles at x == 0 are not skipped.
        if x is None:
            continue

        yield x, y, w, h, time_seconds


image_search = gr.Interface(
    fn=image_search_performer,
    inputs=[
        gr.components.Image(),
        gr.components.Slider(label="threshold", minimum=0.0, maximum=30.0, value=20.0),
        gr.components.Slider(label="results", minimum=0, maximum=50, value=3, step=1),
    ],
    outputs=gr.JSON(label=""),
    title="Who is in the photo?",
    description="Upload an image of a person and we'll tell you who it is.",
)

image_search_multiple = gr.Interface(
    fn=image_search_performers,
    inputs=[
        gr.components.Image(type='pil'),
        gr.components.Slider(label="threshold", minimum=0.0, maximum=30.0, value=20.0),
        gr.components.Slider(label="results", minimum=0, maximum=50, value=3, step=1),
    ],
    outputs=gr.JSON(label=""),
    title="Who is in the photo?",
    description="Upload an image with one or more people and we'll tell you who they are.",
)

vector_search = gr.Interface(
    fn=vector_search_performer,
    inputs=[
        gr.components.Textbox(),
        gr.components.Slider(label="threshold", minimum=0.0, maximum=30.0, value=20.0),
        gr.components.Slider(label="results", minimum=0, maximum=50, value=3, step=1),
    ],
    outputs=gr.JSON(label=""),
    title="Who is in the photo?",
    description="Paste a 512-dimensional DeepFace vector of a person and we'll tell you who it is.",
)

faces_in_sprite = gr.Interface(
    fn=find_faces_in_sprite,
    inputs=[
        gr.Image(),
        gr.Textbox(label="VTT file"),
    ],
    outputs=gr.JSON(label=""),
)

gr.TabbedInterface(
    [image_search, image_search_multiple, vector_search, faces_in_sprite]
).queue().launch(server_name="0.0.0.0")
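
# Illustrative WebVTT fragment of the shape getVTToffsets expects (hypothetical
# values): each cue carries a start time and an xywh media fragment locating
# one thumbnail inside the sprite sheet.
#
#   WEBVTT
#
#   00:00:00.000 --> 00:00:05.000
#   sprite.jpg#xywh=0,0,160,90
#
#   00:00:05.000 --> 00:00:10.000
#   sprite.jpg#xywh=160,0,160,90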