from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import datasets
import gradio as gr
import numpy as np
import torchvision.transforms as transforms
import mediapipe as mp
import cv2

# Three CLIP backbones are compared side by side; the B-32 score drives the decision.
model = SentenceTransformer('clip-ViT-B-32')
model2 = SentenceTransformer('clip-ViT-B-16')
model3 = SentenceTransformer('clip-ViT-L-14')

dataset = datasets.load_dataset('brendenc/celeb-identities')


def predict(im1, im2):
    # Convert the PIL Images to numpy arrays (RGB, as MediaPipe expects)
    im1 = np.array(im1)
    im2 = np.array(im2)

    # Fall back to the full frames if no face is detected below
    face1 = im1.copy()
    face2 = im2.copy()

    img1_h, img1_w, _ = im1.shape
    img2_h, img2_w, _ = im2.shape

    # Locate each face with MediaPipe Face Mesh. static_image_mode=True treats
    # each input as an independent still image; without it, FaceMesh tracks
    # landmarks across calls as if the two images were consecutive video frames.
    mp_face_mesh = mp.solutions.face_mesh
    with mp_face_mesh.FaceMesh(static_image_mode=True,
                               max_num_faces=1,
                               refine_landmarks=True,
                               min_detection_confidence=0.5) as face_mesh:
        results1 = face_mesh.process(im1)
        results2 = face_mesh.process(im2)

    if results1.multi_face_landmarks:
        for face_landmarks in results1.multi_face_landmarks:
            # Bounding box from the outermost landmarks:
            # 234/454 are the left/right face edges, 10/152 the top/bottom.
            top_x = int(face_landmarks.landmark[234].x * img1_w)
            top_y = int(face_landmarks.landmark[10].y * img1_h)
            bottom_x = int(face_landmarks.landmark[454].x * img1_w)
            bottom_y = int(face_landmarks.landmark[152].y * img1_h)

            # Copy the crop so the rectangle drawn on im1 below doesn't
            # bleed into the face image that gets embedded.
            face1 = im1[top_y:bottom_y, top_x:bottom_x].copy()
            cv2.rectangle(im1, (top_x, top_y), (bottom_x, bottom_y), (0, 255, 0), 2)

    if results2.multi_face_landmarks:
        for face_landmarks in results2.multi_face_landmarks:
            top_x = int(face_landmarks.landmark[234].x * img2_w)
            top_y = int(face_landmarks.landmark[10].y * img2_h)
            bottom_x = int(face_landmarks.landmark[454].x * img2_w)
            bottom_y = int(face_landmarks.landmark[152].y * img2_h)

            face2 = im2[top_y:bottom_y, top_x:bottom_x].copy()
            cv2.rectangle(im2, (top_x, top_y), (bottom_x, bottom_y), (0, 255, 0), 2)

    # Convert the arrays back to PIL Images for display and CLIP encoding
    face1 = transforms.ToPILImage()(face1)
    im1 = transforms.ToPILImage()(im1)
    face2 = transforms.ToPILImage()(face2)
    im2 = transforms.ToPILImage()(im2)

    # Embed the cropped faces with each model; the [0, 1] entry of the
    # similarity matrix is the cosine similarity between the two faces.
    embeddings = model.encode([face1, face2])
    embeddings2 = model2.encode([face1, face2])
    embeddings3 = model3.encode([face1, face2])

    sim = cosine_similarity(embeddings)[0, 1]
    sim2 = cosine_similarity(embeddings2)[0, 1]
    sim3 = cosine_similarity(embeddings3)[0, 1]

    # Empirically chosen threshold on the B-32 similarity
    if sim > 0.82:
        return im1, im2, sim, sim2, sim3, "SAME PERSON, AUTHORIZE PAYMENT"
    else:
        return im1, im2, sim, sim2, sim3, "DIFFERENT PEOPLE, DON'T AUTHORIZE PAYMENT"


interface = gr.Interface(fn=predict,
                         inputs=[gr.Image(value=dataset['train']['image'][10], type="pil", source="webcam"),
                                 gr.Image(value=dataset['train']['image'][17], type="pil", source="webcam")],
                         outputs=[gr.Image(),
                                  gr.Image(),
                                  gr.Number(label="Similarity"),
                                  gr.Number(label="Similarity_b16"),
                                  gr.Number(label="Similarity_l14"),
                                  gr.Textbox(label="Message")],
                         title='Face ID',
                         description='This app uses face biometrics and a similarity score to function as a Face ID application. The similarity score ranges from -1 to 1.')

interface.launch(debug=True)
#interface.launch(share=True)
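
# Optional quick check of predict() without launching the UI (uncomment to run;
# the dataset indices are arbitrary sample images, matching the defaults above):
# im_a, im_b = dataset['train']['image'][10], dataset['train']['image'][17]
# _, _, s_b32, s_b16, s_l14, msg = predict(im_a, im_b)
# print(s_b32, s_b16, s_l14, msg)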