nbeuchat committed on
Commit 6e89871
1 Parent(s): 4aa2a91

scripts for downloading actor data and extracting embeddings

.env.example ADDED
@@ -0,0 +1 @@
+ BING_API_KEY=000000000000000000000000
.gitignore CHANGED
@@ -1,3 +1,7 @@
+ # data files from imdb
+ data/title.*.tsv*
+ data/name.*.tsv*
+
  # Byte-compiled / optimized / DLL files
  __pycache__/
  *.py[cod]
README.md CHANGED
@@ -1,2 +1,25 @@
- # demo_actor_matching
- Small demo to match an input image to a small database of recent actors based on their looks.
+ # Actor matching demo
+
+ Who should play Hannibal (the Carthaginian, not the cannibal) if HBO ever adapts his story? How about you? Which actor should play you?
+ This application lets you input an image and see the top three actors who most closely resemble it based on facial features.
+
+ Try it out on Hugging Face _[Coming Soon]_
+
+
+ ## Data
+
+ The data comes from two sources:
+
+ 1. I built a list of relevant actors who have appeared in popular movies across their careers. The datasets that I used to build it can be found on the [IMDB datasets page](https://datasets.imdbws.com/) (see instructions [here](https://www.imdb.com/interfaces/)).
+ 2. I then found 20 images of each actor using the Microsoft Bing Image Search API, with queries such as *"Brad Pitt, actor or actress"*.
+
+ Note that due to API limits, I only retrieved images for 1,000 actors.
+
+ ## Application
+
+ The application is built with Gradio and deployed on Hugging Face Spaces. In the background, it:
+
+ 1. Uses the [`face_recognition` library](https://github.com/ageitgey/face_recognition) to compute an embedding of the input image
+ 2. Uses Spotify's `annoy` library to efficiently search for the closest actors based on that embedding and a small database of actor face embeddings (a minimal sketch of this lookup follows below)
+ 3. Shows you your best matches!
+
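For illustration, the matching step described in the README could look roughly like this. This is a minimal sketch under stated assumptions, not the application's actual code: it assumes an annoy index was already built offline from the actor embeddings, and the filenames `my_photo.jpg` and `data/actors.ann` are placeholders.

```python
import face_recognition
from annoy import AnnoyIndex

# Embed the input image; face_recognition returns 128-dimensional face encodings
image = face_recognition.load_image_file("my_photo.jpg")
encoding = face_recognition.face_encodings(image)[0]

# Query a prebuilt annoy index of actor face embeddings for the 3 nearest faces
index = AnnoyIndex(128, "euclidean")
index.load("data/actors.ann")  # placeholder: index built offline from the embeddings CSV
top_matches = index.get_nns_by_vector(encoding, 3)  # row ids of the 3 closest actor faces
```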
combine_actors_data.py ADDED
@@ -0,0 +1,67 @@
+ import pandas as pd
+ from datetime import datetime
+
+
+ def process_actors_data(keep_alive: bool = True):
+     current_year = datetime.now().year
+
+     # Read actors data
+     df = pd.read_csv("data/name.basics.tsv", sep="\t")
+     df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce")
+     df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce")
+
+     # Prepare and clean up actors data
+     if keep_alive:
+         df = df[df["deathYear"].isna()]
+     df = df[df.knownForTitles.apply(len) > 0]
+     df = df.dropna(subset=["primaryProfession"])
+     df = df[df.primaryProfession.apply(lambda x: "actor" in x.split(","))]
+     df = df[df.knownForTitles != "\\N"]
+     df = df.dropna(subset=["birthYear"])
+     #df["knownForTitles"] = df["knownForTitles"].apply(lambda x: x.split(","))
+
+     #dfat = df[["nconst", "knownForTitles"]].explode("knownForTitles")
+     #dfat.columns = ["nconst", "tconst"]
+     dfat = pd.read_csv("data/title.principals.tsv.gz", sep="\t")
+     dfat = dfat[dfat.category.isin(["actor", "self"])][["tconst", "nconst"]]
+
+     # Get data for the movies/shows the actors were known for
+     dftr = pd.read_csv("data/title.ratings.tsv", sep="\t")
+     dftb = pd.read_csv("data/title.basics.tsv", sep="\t")
+     dftb["startYear"] = pd.to_numeric(dftb["startYear"], errors="coerce")
+     dftb["endYear"] = pd.to_numeric(dftb["endYear"], errors="coerce")
+
+     # Estimate the last year the show/movie was released (TV shows span several years and might still be active)
+     dftb.loc[(dftb.titleType.isin(["tvSeries", "tvMiniSeries"]) & (dftb.endYear.isna())), "lastYear"] = current_year
+     dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"])
+     dftb = dftb.dropna(subset=["lastYear"])
+     dftb = dftb[dftb.isAdult == 0]
+
+     # Aggregate stats for all movies the actor was known for
+     dft = pd.merge(dftb, dftr, how="inner", on="tconst")
+     del dftb, dftr
+     dfat = pd.merge(dfat, dft, how="inner", on="tconst")
+     del dft
+     dfat["totalRating"] = dfat.averageRating * dfat.numVotes
+     dfat = dfat.groupby("nconst").agg({
+         "averageRating": "mean",
+         "totalRating": "sum",
+         "numVotes": "sum",
+         "tconst": "count",
+         "startYear": "min",
+         "lastYear": "max",
+     })
+
+     # Merge everything with actor data and clean up
+     df = df.drop(["deathYear", "knownForTitles", "primaryProfession"], axis=1)
+     df = pd.merge(df, dfat, how="inner", on="nconst").sort_values("totalRating", ascending=False)
+     df = df.dropna(subset=["birthYear", "startYear", "lastYear"])
+     df[["birthYear", "startYear", "lastYear"]] = df[["birthYear", "startYear", "lastYear"]].astype(int)
+     df = df.round(2)
+
+     return df
+
+
+ if __name__ == "__main__":
+     df = process_actors_data()
+     df.to_csv("data/imdb_actors.csv", index=False)
data/.gitkeep ADDED
File without changes
download_imdb_data.py ADDED
@@ -0,0 +1,34 @@
+ import os
+ import gzip
+ import shutil
+ from urllib.request import urlretrieve
+ from tqdm import tqdm
+
+
+ def download_large_file(url: str, output_file: str):
+     if not os.path.exists(output_file):
+         urlretrieve(url, output_file)
+
+
+ def unzip_file(input_file):
+     # Input file has the format xxx.tsv.gz; strip the .gz extension
+     output_file = os.path.splitext(input_file)[0]
+     if not os.path.exists(output_file):
+         with gzip.open(input_file, "rb") as f_in:
+             with open(output_file, "wb") as f_out:
+                 shutil.copyfileobj(f_in, f_out)
+
+
+ if __name__ == "__main__":
+     imdb_url = "https://datasets.imdbws.com"
+     filenames = [
+         "name.basics.tsv.gz",
+         "title.basics.tsv.gz",
+         "title.ratings.tsv.gz",
+         "title.principals.tsv.gz"
+     ]
+     for filename in tqdm(filenames):
+         url = f"{imdb_url}/{filename}"
+         output_file = os.path.join("data", filename)
+         download_large_file(url, output_file)
+         unzip_file(output_file)
get_images_data.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import requests
+ import pandas as pd
+ import time
+
+ from datetime import datetime
+ from tqdm import tqdm
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ BING_API_KEY = os.getenv("BING_API_KEY", None)
+
+
+ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
+     """Get a list of actor images from the Bing Image Search API"""
+     if api_key is None:
+         raise ValueError("You must provide a Bing API key")
+
+     headers = {
+         "Ocp-Apim-Subscription-Key": api_key
+     }
+     query = f"{name}, actor or actress"
+     params = {
+         "q": query,
+         "count": count,
+         "imageType": "Photo",
+         "safeSearch": "Strict",
+         "imageContent": "Face",
+         "freshness": "Year"
+     }
+     response = requests.get(
+         "https://api.bing.microsoft.com/v7.0/images/search",
+         headers=headers,
+         params=params
+     )
+
+     response.raise_for_status()
+     return response.json()
+
+
+ def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
+     """Read and filter the list of actors"""
+
+     df = pd.read_csv("data/imdb_actors.csv")
+     if last_year_active:
+         df = df[df["lastYear"] >= last_year_active]
+
+     if sort_by:
+         df = df.sort_values(sort_by, ascending=False)
+
+     if max_actors:
+         df = df.head(max_actors)
+
+     return df
+
+
+ def store_all_actor_images_data(
+     max_actors: int = None,
+     images_per_actor: int = 10,
+     last_year_active: int = None,
+     output_file: str = None,
+     max_api_calls_per_second: int = 3
+ ):
+     """Get images data for each actor from the Bing Image Search API and store the results as csv"""
+
+     df = read_actors_list(max_actors, last_year_active)
+     df_im = None
+     if output_file:
+         try:
+             df_im = pd.read_csv(output_file)
+         except FileNotFoundError:
+             # file does not exist yet
+             pass
+
+     # Remove actors for which we already have images data
+     if df_im is not None:
+         df = df[~df["nconst"].isin(df_im["nconst"].unique())]
+
+     print(f"Start retrieving images from Bing for {len(df)} actors")
+     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
+         images_data = get_actor_images(
+             name=row["primaryName"],
+             count=images_per_actor
+         )
+         df_im_tmp = pd.DataFrame(images_data["value"])
+         df_im_tmp["nconst"] = row["nconst"]
+         df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))
+
+         if df_im is not None:
+             df_im = pd.concat([df_im, df_im_tmp])
+         else:
+             df_im = df_im_tmp
+
+         # Store progress after every actor
+         df_im.to_csv(output_file, index=False)
+
+         # Throttle requests to the Bing Search API
+         time.sleep(1.0 / max_api_calls_per_second)
+
+
+ if __name__ == "__main__":
+     store_all_actor_images_data(
+         output_file="data/actors_images.csv",
+         max_actors=1000,
+         images_per_actor=20,
+         last_year_active=datetime.now().year - 5,
+         max_api_calls_per_second=2
+     )
process_images.py ADDED
@@ -0,0 +1,52 @@
+ import face_recognition
+ import requests
+ import pandas as pd
+ from io import BytesIO
+ from tqdm import tqdm
+
+
+ def get_image(url: str):
+     response = requests.get(url)
+     response.raise_for_status()
+     img_file_object = BytesIO(response.content)
+     return face_recognition.load_image_file(img_file_object)
+
+
+ def get_embeddings(url: str):
+     try:
+         image = get_image(url)
+         return list(face_recognition.face_encodings(image, num_jitters=5, model="large")[0])
+     except Exception as e:
+         print(f"Failed to process {url}: {e}")
+
+
+ def process_all_images(input_file, output_file):
+     df = pd.read_csv(input_file)[["nconst", "contentUrl"]]
+
+     try:
+         df_emb = pd.read_csv(output_file)
+         df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
+     except FileNotFoundError:
+         # file does not exist yet
+         df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
+
+     print(f"Start processing of {df.shape[0]} images")
+     df = df.sample(frac=1)  # shuffle so you get some images for everybody while it's running
+     for _, row in tqdm(df.iterrows(), total=df.shape[0]):
+         embeddings = get_embeddings(row["contentUrl"])
+         if embeddings:
+             new_row = row.copy()
+             new_row["embeddings"] = embeddings
+             df_emb = pd.concat([df_emb, new_row.to_frame().T], ignore_index=True)
+             df_emb.to_csv(output_file, index=False)
+
+     return df_emb
+
+
+ def build_annoy_index():
+     pass
+
+
+ if __name__ == "__main__":
+     output_file = "data/actors_embeddings.csv"
+     df_embeddings = process_all_images(input_file="data/actors_images.csv", output_file=output_file)
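`build_annoy_index` is left as a stub in this commit. Below is a minimal sketch of how it could be filled in, an assumption rather than the author's implementation: it builds an `annoy` index from the embeddings CSV written by `process_all_images`, where the `.ann` output path is a placeholder and `ast.literal_eval` is needed because the CSV round-trips the embedding lists as strings.

```python
import ast
import pandas as pd
from annoy import AnnoyIndex

def build_annoy_index(input_file="data/actors_embeddings.csv", output_file="data/actors.ann"):
    df_emb = pd.read_csv(input_file)

    # The CSV stores each embedding as a stringified list, so parse it back
    embeddings = df_emb["embeddings"].apply(ast.literal_eval)

    index = AnnoyIndex(128, "euclidean")  # face_recognition encodings are 128-dimensional
    for i, emb in enumerate(embeddings):
        index.add_item(i, emb)
    index.build(10)  # 10 trees: a common default trade-off between speed and accuracy
    index.save(output_file)
```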
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ # Inference
+ cmake # required for dlib (used by face_recognition)
+ face_recognition
+ annoy
+
+ # Preprocessing
+ microsoft-bing-imagesearch
+ python-dotenv
+ pandas
+ requests
+ tqdm