File size: 1,892 Bytes
6e89871
 
 
 
 
6631114
6e89871
 
 
e3012f6
 
 
 
6e89871
 
 
 
 
 
 
6631114
 
6e89871
 
 
 
75ce42f
6e89871
 
 
 
 
 
 
 
 
75ce42f
 
6e89871
 
6631114
 
75ce42f
6631114
 
 
6e89871
6631114
 
6e89871
 
 
 
 
 
e3012f6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import face_recognition
import requests
import pandas as pd
from io import BytesIO
from tqdm import tqdm
from time import time


def get_image(url: str):
    headers = {
        "User-Agent": "Actors matching app 1.0"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    img_file_object = BytesIO(response.content)
    return face_recognition.load_image_file(img_file_object)

def get_embeddings(url: str):
    try:
        image = get_image(url)
        embeddings = face_recognition.face_encodings(image, num_jitters=2, model="large")
        return list(embeddings[0])
    except Exception as e:
        print(e)

def process_all_images(input_file, output_file):
    df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
    
    try:
        df_emb = pd.read_csv(output_file)
        df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
    except: 
        # file does not exists yet
        df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])

    print(f"Start processing of {df.shape[0]} images")
    df = df.sort_values("resultPosition", ascending=True)
    #df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        embeddings = get_embeddings(row["contentUrl"])
        new_row = row.copy()
        new_row["embeddings"] = embeddings
        new_row = new_row[["nconst", "contentUrl", "embeddings"]]
        df_emb = df_emb.append(new_row, ignore_index=True)

        if i % 5 == 0:
            df_emb.to_csv(output_file, index=False)

    df_emb.to_csv(output_file, index=False)
    return df_emb

def build_annoy_index():
    pass

if __name__ == "__main__":
    output_file = "../data/actors_embeddings.csv"
    df_embeddings = process_all_images(input_file="../data/actors_images.csv", output_file=output_file)