from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
from PIL import Image, ImageDraw

# Both models are loaded once at import time and reused by every call below.
checkpoint = "google/owlvit-base-patch32"
detector = pipeline(model=checkpoint, task="zero-shot-object-detection")
model = SentenceTransformer('clip-ViT-L-14')


def get_face_image(im1, size=(256, 256)):
    """Return the highest-scoring "human face" detection, cropped and resized.

    Args:
        im1: PIL image to search for a face. It is not modified.
        size: ``(width, height)`` of the returned crop. Defaults to 256x256.

    Returns:
        A new PIL image holding the best-scoring detected region resized
        to ``size``.

    Raises:
        ValueError: If the detector returns no "human face" prediction.
    """
    predictions = detector(
        im1,
        candidate_labels=["human face"],
    )

    # Keep only the most confident detection; ties resolve to the first one.
    best = max(predictions, key=lambda p: p["score"], default=None)
    if best is None:
        raise ValueError("No human face detected in the image")

    # Read the coordinates by key rather than relying on dict ordering.
    box = best["box"]
    box_area = (box["xmin"], box["ymin"], box["xmax"], box["ymax"])

    # No outline is drawn on the image: drawing before cropping would mutate
    # the caller's image and put the outline pixels into the crop that is
    # later embedded.
    return im1.crop(box_area).resize(size)


def predict(im1, im2, inp_sim):
    """Compare the faces found in two images against a similarity threshold.

    Args:
        im1: First PIL image.
        im2: Second PIL image.
        inp_sim: Threshold; similarity strictly greater than this is treated
            as a match.

    Returns:
        A ``(sim, message)`` tuple, where ``sim`` is the value returned by
        ``util.cos_sim`` for the two face embeddings and ``message`` is the
        verdict string.

    Raises:
        ValueError: If no face is detected in either image.
    """
    face_image1 = get_face_image(im1)
    face_image2 = get_face_image(im2)

    # Encode both crops in one batch, then compare the two embeddings.
    img_emb = model.encode([face_image1, face_image2])
    sim = util.cos_sim(img_emb[0], img_emb[1])

    if sim > inp_sim:
        return sim, "SAME PERSON, UNLOCK PHONE"
    return sim, "DIFFERENT PEOPLE, DON'T UNLOCK"