nbeuchat committed on
Commit 6e89871
1 Parent(s): 4aa2a91

scripts for downloading actor data and extracting embeddings

.env.example ADDED
@@ -0,0 +1 @@
+ BING_API_KEY=000000000000000000000000
.gitignore CHANGED
@@ -1,3 +1,7 @@
+ # data files from imdb
+ data/title.*.tsv*
+ data/name.*.tsv*
+
  # Byte-compiled / optimized / DLL files
  __pycache__/
  *.py[cod]
README.md CHANGED
@@ -1,2 +1,25 @@
- # demo_actor_matching
- Small demo to match an input image to a small database of recent actors based on their looks.
+ # Actor matching demo
+
+ Who should play Hannibal (the Carthaginian, not the cannibal) if HBO ever adapts his story? How about you? Which actor should play you?
+ This application lets you input an image and see the top three actors who most closely resemble it based on facial features.
+
+ Try it out on Hugging Face _[Coming Soon]_
+
+
+ ## Data
+
+ The data comes from two sources:
+
+ 1. I built a list of relevant actors who have appeared in popular movies across their careers. The datasets that I used to build it can be found on the [IMDB datasets page](https://datasets.imdbws.com/) (see instructions [here](https://www.imdb.com/interfaces/)).
+ 2. I then found 20 images of each actor using the Microsoft Bing Image Search API, with queries such as *"Brad Pitt, actor or actress"*.
+
+ Note that due to API limits, I only retrieved images for 1,000 actors.
+
+ ## Application
+
+ The application is built with Gradio and deployed on Hugging Face Spaces. In the background, it:
+
+ 1. Uses the [`face_recognition` library](https://github.com/ageitgey/face_recognition) to compute an embedding of the input image
+ 2. Uses Spotify's `annoy` library to efficiently search for the closest actors based on that embedding and a small database of actor face embeddings (a minimal sketch of this lookup follows below)
+ 3. Shows you your best matches!
+
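For illustration, the matching step described in the README could look roughly like this. This is a minimal sketch under stated assumptions, not the application's actual code: it assumes an annoy index was already built offline from the actor embeddings, and the filenames `my_photo.jpg` and `data/actors.ann` are placeholders.

```python
import face_recognition
from annoy import AnnoyIndex

# Embed the input image; face_recognition returns 128-dimensional face encodings
image = face_recognition.load_image_file("my_photo.jpg")
encoding = face_recognition.face_encodings(image)[0]

# Query a prebuilt annoy index of actor face embeddings for the 3 nearest faces
index = AnnoyIndex(128, "euclidean")
index.load("data/actors.ann")  # placeholder: index built offline from the embeddings CSV
top_matches = index.get_nns_by_vector(encoding, 3)  # row ids of the 3 closest actor faces
```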
combine_actors_data.py ADDED
@@ -0,0 +1,67 @@
+ import pandas as pd
+ from datetime import datetime
+
+
+ def process_actors_data(keep_alive: bool = True):
+     current_year = datetime.now().year
+
+     # Read actors data
+     df = pd.read_csv("data/name.basics.tsv", sep="\t")
+     df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce")
+     df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce")
+
+     # Prepare and clean up actors data
+     if keep_alive:
+         df = df[df["deathYear"].isna()]
+     df = df[df.knownForTitles.apply(len) > 0]
+     df = df.dropna(subset=["primaryProfession"])
+     df = df[df.primaryProfession.apply(lambda x: "actor" in x.split(","))]
+     df = df[df.knownForTitles != "\\N"]
+     df = df.dropna(subset=["birthYear"])
+     #df["knownForTitles"] = df["knownForTitles"].apply(lambda x: x.split(","))
+
+     #dfat = df[["nconst", "knownForTitles"]].explode("knownForTitles")
+     #dfat.columns = ["nconst", "tconst"]
+     dfat = pd.read_csv("data/title.principals.tsv.gz", sep="\t")
+     dfat = dfat[dfat.category.isin(["actor", "self"])][["tconst", "nconst"]]
+
+     # Get data for the movies/shows the actors were known for
+     dftr = pd.read_csv("data/title.ratings.tsv", sep="\t")
+     dftb = pd.read_csv("data/title.basics.tsv", sep="\t")
+     dftb["startYear"] = pd.to_numeric(dftb["startYear"], errors="coerce")
+     dftb["endYear"] = pd.to_numeric(dftb["endYear"], errors="coerce")
+
+     # Estimate the last year the show/movie was released (TV shows span several years and might still be active)
+     dftb.loc[(dftb.titleType.isin(["tvSeries", "tvMiniSeries"]) & (dftb.endYear.isna())), "lastYear"] = current_year
+     dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"])
+     dftb = dftb.dropna(subset=["lastYear"])
+     dftb = dftb[dftb.isAdult == 0]
+
+     # Aggregate stats for all movies the actor was known for
+     dft = pd.merge(dftb, dftr, how="inner", on="tconst")
+     del dftb, dftr
+     dfat = pd.merge(dfat, dft, how="inner", on="tconst")
+     del dft
+     dfat["totalRating"] = dfat.averageRating * dfat.numVotes
+     dfat = dfat.groupby("nconst").agg({
+         "averageRating": "mean",
+         "totalRating": "sum",
+         "numVotes": "sum",
+         "tconst": "count",
+         "startYear": "min",
+         "lastYear": "max",
+     })
+
+     # Merge everything with actor data and clean up
+     df = df.drop(["deathYear", "knownForTitles", "primaryProfession"], axis=1)
+     df = pd.merge(df, dfat, how="inner", on="nconst").sort_values("totalRating", ascending=False)
+     df = df.dropna(subset=["birthYear", "startYear", "lastYear"])
+     df[["birthYear", "startYear", "lastYear"]] = df[["birthYear", "startYear", "lastYear"]].astype(int)
+     df = df.round(2)
+
+     return df
+
+
+ if __name__ == "__main__":
+     df = process_actors_data()
+     df.to_csv("data/imdb_actors.csv", index=False)
data/.gitkeep ADDED
File without changes
download_imdb_data.py ADDED
@@ -0,0 +1,34 @@
+ import os
+ import gzip
+ import shutil
+ from urllib.request import urlretrieve
+ from tqdm import tqdm
+
+
+ def download_large_file(url: str, output_file: str):
+     if not os.path.exists(output_file):
+         urlretrieve(url, output_file)
+
+
+ def unzip_file(input_file):
+     # Input file has the format xxx.tsv.gz; strip the .gz extension
+     output_file = os.path.splitext(input_file)[0]
+     if not os.path.exists(output_file):
+         with gzip.open(input_file, "rb") as f_in:
+             with open(output_file, "wb") as f_out:
+                 shutil.copyfileobj(f_in, f_out)
+
+
+ if __name__ == "__main__":
+     imdb_url = "https://datasets.imdbws.com"
+     filenames = [
+         "name.basics.tsv.gz",
+         "title.basics.tsv.gz",
+         "title.ratings.tsv.gz",
+         "title.principals.tsv.gz"
+     ]
+     for filename in tqdm(filenames):
+         url = f"{imdb_url}/{filename}"
+         output_file = os.path.join("data", filename)
+         download_large_file(url, output_file)
+         unzip_file(output_file)
get_images_data.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import requests
+ import pandas as pd
+ import time
+
+ from datetime import datetime
+ from tqdm import tqdm
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ BING_API_KEY = os.getenv("BING_API_KEY", None)
+
+
+ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
+     """Get a list of actor images from the Bing Image Search API"""
+     if api_key is None:
+         raise ValueError("You must provide a Bing API key")
+
+     headers = {
+         "Ocp-Apim-Subscription-Key": api_key
+     }
+     query = f"{name}, actor or actress"
+     params = {
+         "q": query,
+         "count": count,
+         "imageType": "Photo",
+         "safeSearch": "Strict",
+         "imageContent": "Face",
+         "freshness": "Year"
+     }
+     response = requests.get(
+         "https://api.bing.microsoft.com/v7.0/images/search",
+         headers=headers,
+         params=params
+     )
+
+     response.raise_for_status()
+     return response.json()
+
+
+ def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
+     """Read and filter the list of actors"""
+
+     df = pd.read_csv("data/imdb_actors.csv")
+     if last_year_active:
+         df = df[df["lastYear"] >= last_year_active]
+
+     if sort_by:
+         df = df.sort_values(sort_by, ascending=False)
+
+     if max_actors:
+         df = df.head(max_actors)
+
+     return df
+
+
+ def store_all_actor_images_data(
+     max_actors: int = None,
+     images_per_actor: int = 10,
+     last_year_active: int = None,
+     output_file: str = None,
+     max_api_calls_per_second: int = 3
+ ):
+     """Get images data for each actor from the Bing Image Search API and store the results as csv"""
+
+     df = read_actors_list(max_actors, last_year_active)
+     df_im = None
+     if output_file:
+         try:
+             df_im = pd.read_csv(output_file)
+         except FileNotFoundError:
+             # file does not exist yet
+             pass
+
+     # Remove actors for which we already have images data
+     if df_im is not None:
+         df = df[~df["nconst"].isin(df_im["nconst"].unique())]
+
+     print(f"Start retrieving images from Bing for {len(df)} actors")
+     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
+         images_data = get_actor_images(
+             name=row["primaryName"],
+             count=images_per_actor
+         )
+         df_im_tmp = pd.DataFrame(images_data["value"])
+         df_im_tmp["nconst"] = row["nconst"]
+         df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))
+
+         if df_im is not None:
+             df_im = pd.concat([df_im, df_im_tmp])
+         else:
+             df_im = df_im_tmp
+
+         # Store progress after every actor
+         df_im.to_csv(output_file, index=False)
+
+         # Throttle requests to the Bing Search API
+         time.sleep(1.0 / max_api_calls_per_second)
+
+
+ if __name__ == "__main__":
+     store_all_actor_images_data(
+         output_file="data/actors_images.csv",
+         max_actors=1000,
+         images_per_actor=20,
+         last_year_active=datetime.now().year - 5,
+         max_api_calls_per_second=2
+     )
process_images.py ADDED
@@ -0,0 +1,52 @@
+ import face_recognition
+ import requests
+ import pandas as pd
+ from io import BytesIO
+ from tqdm import tqdm
+
+
+ def get_image(url: str):
+     response = requests.get(url)
+     response.raise_for_status()
+     img_file_object = BytesIO(response.content)
+     return face_recognition.load_image_file(img_file_object)
+
+
+ def get_embeddings(url: str):
+     try:
+         image = get_image(url)
+         return list(face_recognition.face_encodings(image, num_jitters=5, model="large")[0])
+     except Exception as e:
+         print(f"Failed to process {url}: {e}")
+
+
+ def process_all_images(input_file, output_file):
+     df = pd.read_csv(input_file)[["nconst", "contentUrl"]]
+
+     try:
+         df_emb = pd.read_csv(output_file)
+         df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
+     except FileNotFoundError:
+         # file does not exist yet
+         df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
+
+     print(f"Start processing of {df.shape[0]} images")
+     df = df.sample(frac=1)  # shuffle so you get some images for everybody while it's running
+     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
+         embeddings = get_embeddings(row["contentUrl"])
+         if embeddings:
+             new_row = row.copy()
+             new_row["embeddings"] = embeddings
+             df_emb = pd.concat([df_emb, new_row.to_frame().T], ignore_index=True)
+             df_emb.to_csv(output_file, index=False)
+
+     return df_emb
+
+
+ def build_annoy_index():
+     pass
+
+
+ if __name__ == "__main__":
+     output_file = "data/actors_embeddings.csv"
+     df_embeddings = process_all_images(input_file="data/actors_images.csv", output_file=output_file)
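`build_annoy_index` is left as a stub in this commit. Below is a minimal sketch of how it could be filled in, an assumption rather than the author's implementation: it builds an `annoy` index from the embeddings CSV written by `process_all_images`, where the `.ann` output path is a placeholder and `ast.literal_eval` is needed because the CSV round-trips the embedding lists as strings.

```python
import ast
import pandas as pd
from annoy import AnnoyIndex

def build_annoy_index(input_file="data/actors_embeddings.csv", output_file="data/actors.ann"):
    df_emb = pd.read_csv(input_file)

    # The CSV stores each embedding as a stringified list, so parse it back
    embeddings = df_emb["embeddings"].apply(ast.literal_eval)

    index = AnnoyIndex(128, "euclidean")  # face_recognition encodings are 128-dimensional
    for i, emb in enumerate(embeddings):
        index.add_item(i, emb)
    index.build(10)  # 10 trees: a common default trade-off between speed and accuracy
    index.save(output_file)
```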
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ # Inference
+ cmake # required for dlib (used by face_recognition)
+ face_recognition
+ annoy
+
+ # Preprocessing
+ microsoft-bing-imagesearch
+ python-dotenv
+ pandas
+ requests
+ tqdm