In [1]:
import pandas as pd
from datetime import datetime

# Calendar year at the moment this cell runs; used further down as the assumed
# last active year for TV series that have no end year.
current_year = datetime.now().year
# When True, people with a recorded death year are removed from the dataset.
keep_alive = True

In [None]:
# Read actors data
df = pd.read_csv("data/name.basics.tsv", sep="\t")
# Year values that cannot be parsed as numbers are turned into NaN
df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce")
df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce")

# Prepare and cleanup actors data: drop rows with incomplete data
# (missing profession, missing birth year, or "\N" as known titles)
keep = (
    df["primaryProfession"].notna()
    & df["birthYear"].notna()
    & (df["knownForTitles"] != "\\N")
)
if keep_alive:
    # Only keep people without a recorded death year
    keep &= df["deathYear"].isna()

# Filter once and take an explicit copy: the new columns below are then added
# to an independent frame instead of a slice of the raw data, which avoids
# pandas' SettingWithCopyWarning and several intermediate filtered frames.
df = df.loc[keep].copy()

# Get if a person is an actor or actress. The profession list is split once
# and matched on whole entries, so "actor" never matches inside another word.
professions = df["primaryProfession"].str.split(",")
df["is_actor"] = professions.apply(lambda roles: "actor" in roles)
df["is_actress"] = professions.apply(lambda roles: "actress" in roles)

# Free the helper objects; they are as long as the dataset itself
del keep, professions

A note on genders: I do not have data as to which gender an actor or actress identifies as. It does not matter for this exercise in any case, as we plan to look at facial features irrespective of gender. I use the actor/actress information for two reasons:

1. I only want to keep people who acted in a movie/show, not the rest of the production crew (which may or may not be a good idea in the first place)
2. When doing the Bing Search, I realized that for some people who have homonyms in other professions (such as Graham Greene), I needed to add the word "actor" or "actress" to the search to get more reliable pictures. I initially added the literal string *actor/actress* to the query, which returned strange results in some cases.

In [17]:
# Number of people per (is_actor, is_actress) combination.
# The column is selected before counting so only "nconst" is aggregated,
# instead of counting every column and discarding all but one.
df.groupby(["is_actor", "is_actress"])[["nconst"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,nconst
is_actor,is_actress,Unnamed: 2_level_1
False,True,1554197
True,False,2537757
True,True,222


In [9]:
# Show the first ten people flagged as both actor and actress
df.loc[df["is_actor"] & df["is_actress"]].head(10)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,is_actor,is_actress
98892,nm0103696,Moya Brady,1962.0,,"actor,actress,soundtrack","tt0457513,tt1054606,tt0110647,tt0414387",True,True
116253,nm0122062,Debbie David,,,"actor,actress,special_effects","tt0092455,tt0104743,tt0112178,tt0096875",True,True
301992,nm0318693,Kannu Gill,,,"actress,actor","tt0119721,tt0130197,tt0150992,tt0292490",True,True
830244,nm0881417,Mansi Upadhyay,,,"actress,actor","tt3815878,tt0374887,tt14412608,tt10719514",True,True
954524,nm10034909,Cheryl Kann,,,"actor,actress",tt8813608,True,True
968196,nm1004934,Niloufar Safaie,,,"actor,actress","tt0247638,tt1523296",True,True
975084,nm10056470,Lydia Barton,,,"actor,actress",\N,True,True
1235242,nm10334756,Chesca Foe-a-man,,,"miscellaneous,actor,actress","tt9050468,tt5232792",True,True
1353828,nm10460818,Bhumika Barot,,,"actress,actor","tt15102968,tt11569584,tt9747194,tt10795628",True,True
1461875,nm10576223,Allison Orr,,,"actor,actress",\N,True,True


A few people are marked as both actor and actress in the IMDb data. Looking at these cases manually, this appears to be an error in the database: they are actually actresses.

In [12]:
# Keep only actors and actresses in the dataset.
# .copy() makes the filtered frame independent, so adding the "role" column
# below does not assign into a slice of the previous frame
# (which would raise pandas' SettingWithCopyWarning).
df = df[df.is_actor | df.is_actress].copy()

# Every remaining row is an actor and/or an actress, so no other role value
# is needed. Assume that if someone is marked as both actor and actress,
# it's an actress: the actress label is applied last and therefore wins.
df["role"] = "actor"
df.loc[df.is_actress, "role"] = "actress"

In [18]:
# Number of people per role
df[["role", "nconst"]].groupby("role").count()

Unnamed: 0_level_0,nconst
role,Unnamed: 1_level_1
actor,2537757
actress,1554419


In [None]:
# Get full list of movies/shows by actor.
# Only the three columns used here are loaded; credits are kept when the
# person is listed as actor, actress or self.
dfat = pd.read_csv(
    "data/title.principals.tsv.gz",
    sep="\t",
    usecols=["tconst", "nconst", "category"],
)
dfat = dfat.loc[dfat["category"].isin(["actor", "actress", "self"]), ["tconst", "nconst"]]

# Get data for the movies/shows the actors appeared in
dftr = pd.read_csv("data/title.ratings.tsv", sep="\t")
dftb = pd.read_csv("data/title.basics.tsv", sep="\t")
# Year values that cannot be parsed as numbers are turned into NaN
dftb["startYear"] = pd.to_numeric(dftb["startYear"], errors="coerce")
dftb["endYear"] = pd.to_numeric(dftb["endYear"], errors="coerce")

# Estimate last year the show/movie was released (TV shows span several years and might still be active)
# This is used to later filter for actors that were recently acting in something
# Order of precedence:
#   1. the recorded end year, when there is one (previously ignored, so a
#      finished series was dated to its first year),
#   2. the current year for series without an end year (treated as still active),
#   3. the start year for everything else.
is_series = dftb["titleType"].isin(["tvSeries", "tvMiniSeries"])
dftb["lastYear"] = dftb["endYear"]
dftb.loc[is_series & dftb["endYear"].isna(), "lastYear"] = current_year
dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"])

# Titles for which no year could be determined are dropped
dftb = dftb.dropna(subset=["lastYear"])
# Exclude adult titles
# NOTE(review): assumes isAdult is parsed as a numeric column - confirm
dftb = dftb[dftb.isAdult == 0]

In [None]:
# Attach ratings to the title metadata, then attach both to the per-person
# credits. Each intermediate frame is deleted as soon as it has been merged.
dft = dftb.merge(dftr, how="inner", on="tconst")
del dftb, dftr
dfat = dfat.merge(dft, how="inner", on="tconst")
del dft

# Rating multiplied by vote count, so it can be summed per person
dfat["totalRating"] = dfat["averageRating"] * dfat["numVotes"]

# One row per person (nconst): mean rating, summed rating and votes,
# number of titles, earliest start year and latest last year
dfat = dfat.groupby("nconst").agg(
    averageRating=("averageRating", "mean"),
    totalRating=("totalRating", "sum"),
    numVotes=("numVotes", "sum"),
    tconst=("tconst", "count"),
    startYear=("startYear", "min"),
    lastYear=("lastYear", "max"),
)

In [None]:
# Drop the columns that are no longer needed, join the per-person title
# statistics, order by total rating (highest first) and remove rows that
# lack any of the three year values
df = (
    df.drop(columns=["deathYear", "knownForTitles", "primaryProfession"])
    .merge(dfat, how="inner", on="nconst")
    .sort_values("totalRating", ascending=False)
    .dropna(subset=["birthYear", "startYear", "lastYear"])
)

# Years are whole numbers once the missing values are gone
year_cols = ["birthYear", "startYear", "lastYear"]
df[year_cols] = df[year_cols].astype(int)

# Round the remaining numeric columns to two decimals
df = df.round(2)