File size: 2,782 Bytes
6e89871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cecccf
6e89871
 
 
 
 
 
 
0cecccf
6e89871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from datetime import datetime
from pathlib import Path

import pandas as pd
    

# Professions in name.basics that qualify a person as an actor.
_ACTING_PROFESSIONS = frozenset({"actor", "actress"})
# Credit categories in title.principals that are counted as an appearance.
_CAST_CATEGORIES = ("actor", "actress", "self")
# Title types that are treated as still running when no end year is recorded.
_SERIES_TYPES = ("tvSeries", "tvMiniSeries")
_YEAR_COLUMNS = ["birthYear", "startYear", "lastYear"]


def _load_actors(path, keep_alive):
    """Read name.basics and keep actors/actresses with a known birth year."""
    people = pd.read_csv(path, sep="\t")
    # Missing years are encoded as "\N"; coercion turns them into NaN.
    people["birthYear"] = pd.to_numeric(people["birthYear"], errors="coerce")
    people["deathYear"] = pd.to_numeric(people["deathYear"], errors="coerce")

    if keep_alive:
        people = people[people["deathYear"].isna()]

    # Dropping NaN first keeps the string operations below from failing on
    # float NaN values (an empty field is parsed as NaN, not as "").
    people = people.dropna(
        subset=["birthYear", "primaryProfession", "knownForTitles"]
    )
    people = people[people["knownForTitles"] != "\\N"]

    # astype(bool) keeps the mask boolean even when no rows are left.
    is_actor = people["primaryProfession"].apply(
        lambda jobs: not _ACTING_PROFESSIONS.isdisjoint(jobs.split(","))
    ).astype(bool)
    people = people[is_actor]
    return people.drop(
        columns=["deathYear", "knownForTitles", "primaryProfession"]
    )


def _load_rated_titles(basics_path, ratings_path, current_year):
    """Read non-adult titles with their ratings and an estimated last year."""
    titles = pd.read_csv(
        basics_path,
        sep="\t",
        usecols=["tconst", "titleType", "isAdult", "startYear", "endYear"],
    )
    titles["startYear"] = pd.to_numeric(titles["startYear"], errors="coerce")
    titles["endYear"] = pd.to_numeric(titles["endYear"], errors="coerce")

    # Last active year: the recorded end year if there is one; otherwise the
    # current year for series without an end year (assumed still running);
    # otherwise the start year (single-release titles).
    still_running = titles["titleType"].isin(_SERIES_TYPES) & titles["endYear"].isna()
    titles["lastYear"] = titles["endYear"]
    titles.loc[still_running, "lastYear"] = current_year
    titles["lastYear"] = titles["lastYear"].fillna(titles["startYear"])

    # Coerce isAdult so the comparison also works if the column was parsed
    # as strings rather than integers.
    not_adult = pd.to_numeric(titles["isAdult"], errors="coerce") == 0
    keep = titles["lastYear"].notna() & not_adult
    titles = titles.loc[keep, ["tconst", "startYear", "lastYear"]]

    ratings = pd.read_csv(
        ratings_path, sep="\t", usecols=["tconst", "averageRating", "numVotes"]
    )
    return titles.merge(ratings, how="inner", on="tconst")


def _aggregate_credits(principals_path, titles):
    """Aggregate title statistics per person (indexed by nconst)."""
    cast = pd.read_csv(
        principals_path, sep="\t", usecols=["tconst", "nconst", "category"]
    )
    cast = cast.loc[cast["category"].isin(_CAST_CATEGORIES), ["tconst", "nconst"]]
    # A person can be credited more than once on the same title; count each
    # title once so votes and title counts are not inflated.
    cast = cast.drop_duplicates()

    credits = cast.merge(titles, how="inner", on="tconst")
    credits["totalRating"] = credits["averageRating"] * credits["numVotes"]
    return credits.groupby("nconst").agg(
        {
            "averageRating": "mean",
            "totalRating": "sum",
            "numVotes": "sum",
            "tconst": "count",
            "startYear": "min",
            "lastYear": "max",
        }
    )


def process_actors_data(keep_alive: bool = True, data_dir: "str | Path" = "data") -> pd.DataFrame:
    """Build a per-actor table of rating statistics from IMDb TSV dumps.

    Reads ``name.basics.tsv``, ``title.principals.tsv.gz``,
    ``title.ratings.tsv`` and ``title.basics.tsv`` from ``data_dir``.

    Args:
        keep_alive: If true, keep only people without a recorded death year.
        data_dir: Directory containing the four input files.

    Returns:
        One row per actor/actress with a known birth year who has at least one
        rated, non-adult title credit, sorted by ``totalRating`` descending.
        Besides the remaining name.basics columns, each row carries
        ``averageRating`` (mean over titles), ``totalRating`` (sum of
        rating * votes), ``numVotes`` (sum), ``tconst`` (number of titles),
        ``startYear`` (earliest start) and ``lastYear`` (latest estimated last
        year). Year columns are integers; floats are rounded to 2 decimals.
    """
    data_dir = Path(data_dir)
    current_year = datetime.now().year

    actors = _load_actors(data_dir / "name.basics.tsv", keep_alive)
    titles = _load_rated_titles(
        data_dir / "title.basics.tsv",
        data_dir / "title.ratings.tsv",
        current_year,
    )
    per_actor = _aggregate_credits(data_dir / "title.principals.tsv.gz", titles)
    del titles  # release the title table before the final merge

    result = pd.merge(actors, per_actor, how="inner", on="nconst")
    result = result.sort_values("totalRating", ascending=False)
    # startYear can still be NaN when none of a person's titles has one.
    result = result.dropna(subset=_YEAR_COLUMNS)
    result[_YEAR_COLUMNS] = result[_YEAR_COLUMNS].astype(int)
    return result.round(2)
    

if __name__ == "__main__":
    # Script entry point: build the actor table with the default settings and
    # write it as CSV, leaving out the DataFrame index.
    actors_table = process_actors_data()
    actors_table.to_csv("data/imdb_actors.csv", index=False)