{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from datetime import datetime\n", "\n", "current_year = datetime.now().year\n", "keep_alive = True" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read actors data\n", "df = pd.read_csv(\"data/name.basics.tsv\", sep=\"\\t\")\n", "# Non-numeric years become NaN so they can be filtered with isna()/dropna()\n", "df[\"birthYear\"] = pd.to_numeric(df[\"birthYear\"], errors=\"coerce\")\n", "df[\"deathYear\"] = pd.to_numeric(df[\"deathYear\"], errors=\"coerce\")\n", "\n", "# Prepare and cleanup actors data\n", "if keep_alive:\n", "    # No parseable deathYear is taken to mean the person is still alive\n", "    df = df[df[\"deathYear\"].isna()]\n", "\n", "# Drop rows with incomplete data (a literal backslash-N in knownForTitles counts as missing)\n", "df = df.dropna(subset=[\"primaryProfession\", \"birthYear\"])\n", "# .copy() detaches the filtered rows so the new columns below are written to an\n", "# independent frame rather than a slice of the raw data (no SettingWithCopyWarning)\n", "df = df[df.knownForTitles != \"\\\\N\"].copy()\n", "\n", "# Flag whether \"actor\" / \"actress\" is one of the comma-separated professions\n", "professions = df.primaryProfession.str.split(\",\")\n", "df[\"is_actor\"] = professions.apply(lambda jobs: \"actor\" in jobs)\n", "df[\"is_actress\"] = professions.apply(lambda jobs: \"actress\" in jobs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A note on genders: I do not have data as to which gender an actor or actress identifies as. It does not matter for this exercise in any case as we plan to look at facial features irrespective of gender. I use the actor/actress information for two reasons:\n", "\n", "1. I only want to keep people who acted in a movie/show, not the rest of the production crew (which may or may not be a good idea in the first place)\n", "2. When doing the Bing Search, I realize that for some people that have homonyms in other professions (such as Graham Green), I need to add the word \"actor\" or \"actress\" to the search to get more reliable pictures. 
I initially only added *actor/actress* in the query which returned strange results in some cases" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nconst
is_actoris_actress
FalseTrue1554197
TrueFalse2537757
True222
\n", "
" ], "text/plain": [ " nconst\n", "is_actor is_actress \n", "False True 1554197\n", "True False 2537757\n", " True 222" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of people per (is_actor, is_actress) combination.\n", "# Select nconst before counting so only that one column is aggregated.\n", "df.groupby([\"is_actor\", \"is_actress\"])[[\"nconst\"]].count()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nconstprimaryNamebirthYeardeathYearprimaryProfessionknownForTitlesis_actoris_actress
98892nm0103696Moya Brady1962.0NaNactor,actress,soundtracktt0457513,tt1054606,tt0110647,tt0414387TrueTrue
116253nm0122062Debbie DavidNaNNaNactor,actress,special_effectstt0092455,tt0104743,tt0112178,tt0096875TrueTrue
301992nm0318693Kannu GillNaNNaNactress,actortt0119721,tt0130197,tt0150992,tt0292490TrueTrue
830244nm0881417Mansi UpadhyayNaNNaNactress,actortt3815878,tt0374887,tt14412608,tt10719514TrueTrue
954524nm10034909Cheryl KannNaNNaNactor,actresstt8813608TrueTrue
968196nm1004934Niloufar SafaieNaNNaNactor,actresstt0247638,tt1523296TrueTrue
975084nm10056470Lydia BartonNaNNaNactor,actress\\NTrueTrue
1235242nm10334756Chesca Foe-a-manNaNNaNmiscellaneous,actor,actresstt9050468,tt5232792TrueTrue
1353828nm10460818Bhumika BarotNaNNaNactress,actortt15102968,tt11569584,tt9747194,tt10795628TrueTrue
1461875nm10576223Allison OrrNaNNaNactor,actress\\NTrueTrue
\n", "
" ], "text/plain": [ " nconst primaryName birthYear deathYear \\\n", "98892 nm0103696 Moya Brady 1962.0 NaN \n", "116253 nm0122062 Debbie David NaN NaN \n", "301992 nm0318693 Kannu Gill NaN NaN \n", "830244 nm0881417 Mansi Upadhyay NaN NaN \n", "954524 nm10034909 Cheryl Kann NaN NaN \n", "968196 nm1004934 Niloufar Safaie NaN NaN \n", "975084 nm10056470 Lydia Barton NaN NaN \n", "1235242 nm10334756 Chesca Foe-a-man NaN NaN \n", "1353828 nm10460818 Bhumika Barot NaN NaN \n", "1461875 nm10576223 Allison Orr NaN NaN \n", "\n", " primaryProfession \\\n", "98892 actor,actress,soundtrack \n", "116253 actor,actress,special_effects \n", "301992 actress,actor \n", "830244 actress,actor \n", "954524 actor,actress \n", "968196 actor,actress \n", "975084 actor,actress \n", "1235242 miscellaneous,actor,actress \n", "1353828 actress,actor \n", "1461875 actor,actress \n", "\n", " knownForTitles is_actor is_actress \n", "98892 tt0457513,tt1054606,tt0110647,tt0414387 True True \n", "116253 tt0092455,tt0104743,tt0112178,tt0096875 True True \n", "301992 tt0119721,tt0130197,tt0150992,tt0292490 True True \n", "830244 tt3815878,tt0374887,tt14412608,tt10719514 True True \n", "954524 tt8813608 True True \n", "968196 tt0247638,tt1523296 True True \n", "975084 \\N True True \n", "1235242 tt9050468,tt5232792 True True \n", "1353828 tt15102968,tt11569584,tt9747194,tt10795628 True True \n", "1461875 \\N True True " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.is_actor & df.is_actress].head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A few people are marked both as actor and actress in the IMDb data. Manually looking at these cases, it seems to be an error in the DB and they are actually actresses. 
" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Keep only actors and actresses in the dataset.\n", "# .copy() detaches the filtered rows so adding the \"role\" column below does not\n", "# write into a slice of the previous frame (avoids SettingWithCopyWarning).\n", "df = df[df.is_actor | df.is_actress].copy()\n", "\n", "# Assume that if someone is both marked as actor/actress, it's an actress:\n", "# \"actress\" is assigned last so it overrides \"actor\" for those rows.\n", "df[\"role\"] = \"other\"\n", "df.loc[df.is_actor, \"role\"] = \"actor\"\n", "df.loc[df.is_actress, \"role\"] = \"actress\"" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nconst
role
actor2537757
actress1554419
\n", "
" ], "text/plain": [ " nconst\n", "role \n", "actor 2537757\n", "actress 1554419" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(\"role\")[[\"nconst\"]].count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get full list of movies/shows by actor\n", "dfat = pd.read_csv(\"data/title.principals.tsv.gz\", sep=\"\\t\")\n", "dfat = dfat[dfat.category.isin([\"actor\", \"actress\", \"self\"])][[\"tconst\", \"nconst\"]]\n", "\n", "# Get data for the movies/shows the actors appeared in\n", "dftr = pd.read_csv(\"data/title.ratings.tsv\", sep=\"\\t\")\n", "dftb = pd.read_csv(\"data/title.basics.tsv\", sep=\"\\t\")\n", "dftb[\"startYear\"] = pd.to_numeric(dftb[\"startYear\"], errors=\"coerce\")\n", "dftb[\"endYear\"] = pd.to_numeric(dftb[\"endYear\"], errors=\"coerce\")\n", "\n", "# Estimate last year the show/movie was released (TV shows span several years and might still be active)\n", "# This is used to later filter for actors that were recently acting in something\n", "#   - series without an endYear are assumed to still be running -> current year\n", "#   - titles with a known endYear use it (previously these fell back to startYear)\n", "#   - everything else falls back to startYear\n", "still_running = dftb.titleType.isin([\"tvSeries\", \"tvMiniSeries\"]) & dftb.endYear.isna()\n", "dftb[\"lastYear\"] = dftb[\"endYear\"].fillna(dftb[\"startYear\"])\n", "dftb.loc[still_running, \"lastYear\"] = current_year\n", "dftb = dftb.dropna(subset=[\"lastYear\"])\n", "dftb = dftb[dftb.isAdult == 0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Aggregate stats over all rated titles each person is credited in\n", "dft = pd.merge(dftb, dftr, how=\"inner\", on=\"tconst\")\n", "del dftb, dftr  # free the large title tables once merged\n", "dfat = pd.merge(dfat, dft, how=\"inner\", on=\"tconst\")\n", "del dft\n", "# Rating weighted by number of votes, summed per person below\n", "dfat[\"totalRating\"] = dfat.averageRating*dfat.numVotes\n", "dfat = dfat.groupby(\"nconst\").agg({\n", "    \"averageRating\": \"mean\",\n", "    \"totalRating\": \"sum\",\n", "    \"numVotes\": \"sum\",\n", "    \"tconst\": \"count\",\n", "    \"startYear\": \"min\",\n", "    \"lastYear\": \"max\"\n", "})" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Combine the per-person title statistics with the actor table, then tidy up:\n", "# drop columns no longer needed, rank by totalRating, require all year\n", "# columns to be present, store years as integers and round to 2 decimals.\n", "year_cols = [\"birthYear\", \"startYear\", \"lastYear\"]\n", "df = (\n", "    df.drop(columns=[\"deathYear\", \"knownForTitles\", \"primaryProfession\"])\n", "    .merge(dfat, how=\"inner\", on=\"nconst\")\n", "    .sort_values(\"totalRating\", ascending=False)\n", "    .dropna(subset=year_cols)\n", "    .astype({col: int for col in year_cols})\n", "    .round(2)\n", ")" ] } ], "metadata": { "interpreter": { "hash": "90e1e830ac57dfc2c41e3e7a76c8ffd4bb6262b307f4273d56b17cf39c34bbe6" }, "kernelspec": { "display_name": "Python 3.7.11 64-bit ('actor_matching': conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }