Scripts for downloading actor data and extracting embeddings
- .env.example +1 -0
- .gitignore +4 -0
- README.md +25 -2
- combine_actors_data.py +61 -0
- data/.gitkeep +0 -0
- download_imdb_data.py +32 -0
- get_images_data.py +107 -0
- process_images.py +48 -0
- requirements.txt +10 -0
.env.example
ADDED
@@ -0,0 +1 @@
BING_API_KEY=000000000000000000000000
.gitignore
CHANGED
@@ -1,3 +1,7 @@
+# data files from imdb
+data/title.*.tsv*
+data/name.*.tsv*
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
README.md
CHANGED
@@ -1,2 +1,25 @@
-#
-
+# Actor matching demo
+
+Who should play Hannibal (the Carthaginian, not the cannibal) if HBO ever adapts his story? How about you? Which actor should play you?
+This application lets you provide an image and see the top three actors that most closely resemble it based on facial features.
+
+Try it out on HuggingFace _[Coming Soon]_
+
+
+## Data
+
+The data comes from two sources:
+
+1. I built a list of relevant actors who have appeared in popular movies across their careers. The datasets I used to build it can be found on the [IMDB datasets page](https://datasets.imdbws.com/) (see instructions [here](https://www.imdb.com/interfaces/)).
+2. I then retrieved 20 images of each actor via the Microsoft Bing Image Search API, using queries such as *"Brad Pitt, actor or actress"*.
+
+Note that due to API limits, I only took images of 1,000 actors.
+
+## Application
+
+The application is built with Gradio and deployed on a HuggingFace Space. In the background, it:
+
+1. Uses the [`face_recognition` library](https://github.com/ageitgey/face_recognition) to compute an embedding of the image
+2. Uses Spotify's `annoy` library to efficiently search for the closest actors based on the image embedding and a small database of actors' face embeddings
+3. Shows you your best matches!
+
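As a rough illustration of how steps 1 and 2 of the README fit together at inference time, here is a minimal sketch. The index filename `data/actors.ann` and the query image path are assumptions for illustration, not part of this commit; `face_recognition` encodings are 128-dimensional vectors.

import face_recognition
from annoy import AnnoyIndex

# Load a prebuilt index of actor face embeddings (hypothetical file)
index = AnnoyIndex(128, "euclidean")
index.load("data/actors.ann")

# Embed the query image and find the three nearest actor embeddings
image = face_recognition.load_image_file("my_photo.jpg")
encoding = face_recognition.face_encodings(image)[0]
matches = index.get_nns_by_vector(encoding, 3)
print(matches)  # item ids that map back to rows of the actor database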
combine_actors_data.py
ADDED
@@ -0,0 +1,61 @@
import pandas as pd
from datetime import datetime


def process_actors_data(keep_alive: bool = True):
    current_year = datetime.now().year

    # Read actors data
    df = pd.read_csv("data/name.basics.tsv", sep="\t")
    df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce")
    df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce")

    # Prepare and clean up actors data
    if keep_alive:
        df = df[df["deathYear"].isna()]
    df = df[df.knownForTitles.apply(len) > 0]
    df = df.dropna(subset=["primaryProfession"])
    df = df[df.primaryProfession.apply(lambda x: "actor" in x.split(","))]
    df = df[df.knownForTitles != "\\N"]
    df = df.dropna(subset=["birthYear"])
    # df["knownForTitles"] = df["knownForTitles"].apply(lambda x: x.split(","))

    # dfat = df[["nconst", "knownForTitles"]].explode("knownForTitles")
    # dfat.columns = ["nconst", "tconst"]
    dfat = pd.read_csv("data/title.principals.tsv.gz", sep="\t")
    dfat = dfat[dfat.category.isin(["actor", "self"])][["tconst", "nconst"]]


    # Get data for the movies/shows the actors were known for
    dftr = pd.read_csv("data/title.ratings.tsv", sep="\t")
    dftb = pd.read_csv("data/title.basics.tsv", sep="\t")
    dftb["startYear"] = pd.to_numeric(dftb["startYear"], errors="coerce")
    dftb["endYear"] = pd.to_numeric(dftb["endYear"], errors="coerce")

    # Estimate the last year the show/movie was released (TV shows span several years and might still be active)
    dftb.loc[(dftb.titleType.isin(["tvSeries", "tvMiniSeries"]) & (dftb.endYear.isna())), "lastYear"] = current_year
    dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"])
    dftb = dftb.dropna(subset=["lastYear"])
    dftb = dftb[dftb.isAdult == 0]

    # Aggregate stats for all movies the actor was known for
    dft = pd.merge(dftb, dftr, how="inner", on="tconst")
    del dftb, dftr
    dfat = pd.merge(dfat, dft, how="inner", on="tconst")
    del dft
    dfat["totalRating"] = dfat.averageRating * dfat.numVotes
    dfat = dfat.groupby("nconst").agg({"averageRating": "mean", "totalRating": "sum", "numVotes": "sum", "tconst": "count", "startYear": "min", "lastYear": "max"})

    # Merge everything with actor data and clean up
    df = df.drop(["deathYear", "knownForTitles", "primaryProfession"], axis=1)
    df = pd.merge(df, dfat, how="inner", on="nconst").sort_values("totalRating", ascending=False)
    df = df.dropna(subset=["birthYear", "startYear", "lastYear"])
    df[["birthYear", "startYear", "lastYear"]] = df[["birthYear", "startYear", "lastYear"]].astype(int)
    df = df.round(2)

    return df


if __name__ == "__main__":
    df = process_actors_data()
    df.to_csv("data/imdb_actors.csv", index=False)
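To make the totalRating aggregation concrete, here is a tiny self-contained example of the same weighting (average rating times vote count, summed per actor) on made-up data:

import pandas as pd

# Hypothetical per-title rows, as they look after the merges above
toy = pd.DataFrame({
    "nconst": ["nm1", "nm1", "nm2"],
    "averageRating": [8.0, 6.0, 9.0],
    "numVotes": [1000, 500, 200],
})
toy["totalRating"] = toy.averageRating * toy.numVotes
print(toy.groupby("nconst").agg({"averageRating": "mean", "totalRating": "sum", "numVotes": "sum"}))
# nm1: totalRating = 8.0*1000 + 6.0*500 = 11000.0; nm2: 9.0*200 = 1800.0

This is why prolific actors in widely voted titles float to the top when the output is sorted by totalRating.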
data/.gitkeep
ADDED
(empty file)
download_imdb_data.py
ADDED
@@ -0,0 +1,32 @@
import os
import gzip
import shutil
from urllib.request import urlretrieve
from tqdm import tqdm

def download_large_file(url: str, output_file: str):
    if not os.path.exists(output_file):
        urlretrieve(url, output_file)

def unzip_file(input_file):
    # Input file has the format xxx.tsv.gz, so splitext gives xxx.tsv
    output_file = os.path.splitext(input_file)[0]
    if not os.path.exists(output_file):
        with gzip.open(input_file, "rb") as f_in:
            with open(output_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

if __name__ == "__main__":
    imdb_url = "https://datasets.imdbws.com"
    filenames = [
        "name.basics.tsv.gz",
        "title.basics.tsv.gz",
        "title.ratings.tsv.gz",
        "title.principals.tsv.gz"
    ]
    for filename in tqdm(filenames):
        url = f"{imdb_url}/{filename}"
        output_file = os.path.join("data", filename)
        download_large_file(url, output_file)
        unzip_file(output_file)
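The IMDB dumps are fairly large, so a progress bar during download can be useful. A sketch (not part of this commit) using urlretrieve's reporthook together with tqdm:

from urllib.request import urlretrieve
from tqdm import tqdm

def download_with_progress(url: str, output_file: str):
    # urlretrieve calls the hook with (blocks_transferred, block_size, total_size)
    with tqdm(unit="B", unit_scale=True, desc=output_file) as bar:
        def hook(blocks, block_size, total_size):
            if total_size > 0:
                bar.total = total_size
            bar.update(blocks * block_size - bar.n)
        urlretrieve(url, output_file, reporthook=hook)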
get_images_data.py
ADDED
@@ -0,0 +1,107 @@
import os
import requests
import pandas as pd
import time

from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()

BING_API_KEY = os.getenv("BING_API_KEY", None)

def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
    """Get a list of actor images from the Bing Image Search API"""
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    headers = {
        "Ocp-Apim-Subscription-Key": api_key
    }
    query = f"{name}, actor or actress"
    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year"
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params
    )

    if response.status_code == 200:
        return response.json()

def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
    """Read and filter the list of actors"""

    df = pd.read_csv("data/imdb_actors.csv")
    if last_year_active:
        df = df[df["lastYear"] >= last_year_active]

    if sort_by:
        df = df.sort_values(sort_by, ascending=False)

    if max_actors:
        df = df.head(max_actors)

    return df

def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file = None,
    max_api_calls_per_second: int = 3
):
    """Get images data for each actor from the Bing Image Search API and store the results as csv"""

    df = read_actors_list(max_actors, last_year_active)
    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except FileNotFoundError:
            # file does not exist yet
            pass

    # Remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        images_data = get_actor_images(
            name=row["primaryName"],
            count=images_per_actor
        )
        if not images_data:
            # request failed (non-200 response); skip this actor
            continue
        df_im_tmp = pd.DataFrame(images_data["value"])
        df_im_tmp["nconst"] = row["nconst"]
        df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))

        if df_im is not None:
            df_im = pd.concat([df_im, df_im_tmp])
        else:
            df_im = df_im_tmp

        # Store progress
        df_im.to_csv(output_file, index=False)

        # Limit the speed of requests to Bing Search
        time.sleep(1.0 / max_api_calls_per_second)


if __name__ == "__main__":
    store_all_actor_images_data(
        output_file="data/actors_images.csv",
        max_actors=1000,
        images_per_actor=20,
        last_year_active=datetime.now().year - 5,
        max_api_calls_per_second=2
    )
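get_actor_images returns None on any non-200 response, which the loop above skips. If Bing throttles requests (HTTP 429) despite the sleep, a small retry helper with exponential backoff could be wrapped around the call; this is a sketch, not part of the commit:

import time
import requests

def get_with_retries(url: str, headers: dict, params: dict, max_retries: int = 3):
    # Back off exponentially on throttling or transient server errors
    response = None
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers, params=params)
        if response.status_code not in (429, 500, 503):
            break
        time.sleep(2 ** attempt)
    return response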
process_images.py
ADDED
@@ -0,0 +1,48 @@
import face_recognition
import requests
import pandas as pd
from io import BytesIO
from tqdm import tqdm


def get_image(url: str):
    response = requests.get(url)
    response.raise_for_status()
    img_file_object = BytesIO(response.content)
    return face_recognition.load_image_file(img_file_object)

def get_embeddings(url: str):
    try:
        image = get_image(url)
        return list(face_recognition.face_encodings(image, num_jitters=5, model="large")[0])
    except Exception as e:
        print(e)

def process_all_images(input_file, output_file):
    df = pd.read_csv(input_file)[["nconst", "contentUrl"]]

    try:
        df_emb = pd.read_csv(output_file)
        df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
    except FileNotFoundError:
        # file does not exist yet
        df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])

    print(f"Start processing of {df.shape[0]} images")
    df = df.sample(frac=1)  # shuffle so you get some images for everybody while it's running
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        embeddings = get_embeddings(row["contentUrl"])
        if embeddings:
            new_row = row.copy()
            new_row["embeddings"] = embeddings
            df_emb = pd.concat([df_emb, new_row.to_frame().T], ignore_index=True)
            df_emb.to_csv(output_file, index=False)

    return df_emb

def build_annoy_index():
    pass

if __name__ == "__main__":
    output_file = "data/actors_embeddings.csv"
    df_embeddings = process_all_images(input_file="data/actors_images.csv", output_file=output_file)
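build_annoy_index is still a stub in this commit. Going by the README, a minimal sketch of what it might eventually do with annoy (the filenames, n_trees value, and stringified-list parsing are assumptions, not the author's implementation):

import ast
import pandas as pd
from annoy import AnnoyIndex

def build_annoy_index_sketch(embeddings_file="data/actors_embeddings.csv",
                             index_file="data/actors.ann", n_trees=10):
    index = AnnoyIndex(128, "euclidean")  # face_recognition encodings are 128-dim
    df = pd.read_csv(embeddings_file)
    for i, emb in enumerate(df["embeddings"]):
        # csv round-trips the embedding lists as strings, so parse them back
        index.add_item(i, ast.literal_eval(emb))
    index.build(n_trees)
    index.save(index_file)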
requirements.txt
ADDED
@@ -0,0 +1,10 @@
# Inference
cmake  # required for dlib (used by face_recognition)
face_recognition
annoy

# Preprocessing
microsoft-bing-imagesearch
python-dotenv
pandas
tqdm