{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Filtros para reduzir o numero de instâncias dos conjuntos de dados\n",
"motivação: \n",
"\n",
"Percebi que existem filmes com poucas avaliações, como podemos dizer se um filme é bom com apenas 1 ou 2 avaliações?\n",
"\n",
"Portanto, aqui reduzimos o conjunto de dados ao remover usuários e filmes com pouca \"relevância\" segundo os conceitos citados anteriormente."
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## filtro de numero de avaliações por usuário e filme"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### defina o limite minimo para permanecer"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"n_ratings_movie = 10"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" userId | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1 | \n",
" 4.0 | \n",
" 964982703 | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" 4.0 | \n",
" 964981247 | \n",
"
\n",
" \n",
" 1 | \n",
" 6 | \n",
" 4.0 | \n",
" 964982224 | \n",
"
\n",
" \n",
" 1 | \n",
" 47 | \n",
" 5.0 | \n",
" 964983815 | \n",
"
\n",
" \n",
" 1 | \n",
" 50 | \n",
" 5.0 | \n",
" 964982931 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId rating timestamp\n",
"userId \n",
"1 1 4.0 964982703\n",
"1 3 4.0 964981247\n",
"1 6 4.0 964982224\n",
"1 47 5.0 964983815\n",
"1 50 5.0 964982931"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings = pd.read_csv('../data/standard/ratings.csv', index_col='userId')\n",
"ratings.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 100836.000000 | \n",
" 100836.000000 | \n",
" 1.008360e+05 | \n",
"
\n",
" \n",
" mean | \n",
" 19435.295718 | \n",
" 3.501557 | \n",
" 1.205946e+09 | \n",
"
\n",
" \n",
" std | \n",
" 35530.987199 | \n",
" 1.042529 | \n",
" 2.162610e+08 | \n",
"
\n",
" \n",
" min | \n",
" 1.000000 | \n",
" 0.500000 | \n",
" 8.281246e+08 | \n",
"
\n",
" \n",
" 25% | \n",
" 1199.000000 | \n",
" 3.000000 | \n",
" 1.019124e+09 | \n",
"
\n",
" \n",
" 50% | \n",
" 2991.000000 | \n",
" 3.500000 | \n",
" 1.186087e+09 | \n",
"
\n",
" \n",
" 75% | \n",
" 8122.000000 | \n",
" 4.000000 | \n",
" 1.435994e+09 | \n",
"
\n",
" \n",
" max | \n",
" 193609.000000 | \n",
" 5.000000 | \n",
" 1.537799e+09 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId rating timestamp\n",
"count 100836.000000 100836.000000 1.008360e+05\n",
"mean 19435.295718 3.501557 1.205946e+09\n",
"std 35530.987199 1.042529 2.162610e+08\n",
"min 1.000000 0.500000 8.281246e+08\n",
"25% 1199.000000 3.000000 1.019124e+09\n",
"50% 2991.000000 3.500000 1.186087e+09\n",
"75% 8122.000000 4.000000 1.435994e+09\n",
"max 193609.000000 5.000000 1.537799e+09"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings.describe()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 610.000000\n",
"mean 165.304918\n",
"std 269.480584\n",
"min 20.000000\n",
"25% 35.000000\n",
"50% 70.500000\n",
"75% 168.000000\n",
"max 2698.000000\n",
"Name: count, dtype: float64"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings.index.value_counts().describe()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 9724.000000\n",
"mean 10.369807\n",
"std 22.401005\n",
"min 1.000000\n",
"25% 1.000000\n",
"50% 3.000000\n",
"75% 9.000000\n",
"max 329.000000\n",
"Name: count, dtype: float64"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings['movieId'].value_counts().describe()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"def filter_n_ratings(df, column, n_ratings):\n",
" low = df[column].value_counts() >= n_ratings\n",
" low = low[low == True]\n",
" return df[df[column].isin(low.index)]\n"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"ratings = filter_n_ratings(ratings, 'movieId', n_ratings_movie)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 81116.000000 | \n",
" 81116.000000 | \n",
" 8.111600e+04 | \n",
"
\n",
" \n",
" mean | \n",
" 14857.178078 | \n",
" 3.573678 | \n",
" 1.197217e+09 | \n",
"
\n",
" \n",
" std | \n",
" 29539.336412 | \n",
" 1.018590 | \n",
" 2.167182e+08 | \n",
"
\n",
" \n",
" min | \n",
" 1.000000 | \n",
" 0.500000 | \n",
" 8.281246e+08 | \n",
"
\n",
" \n",
" 25% | \n",
" 1007.000000 | \n",
" 3.000000 | \n",
" 1.001562e+09 | \n",
"
\n",
" \n",
" 50% | \n",
" 2471.000000 | \n",
" 4.000000 | \n",
" 1.180447e+09 | \n",
"
\n",
" \n",
" 75% | \n",
" 6016.000000 | \n",
" 4.000000 | \n",
" 1.431955e+09 | \n",
"
\n",
" \n",
" max | \n",
" 187593.000000 | \n",
" 5.000000 | \n",
" 1.537799e+09 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId rating timestamp\n",
"count 81116.000000 81116.000000 8.111600e+04\n",
"mean 14857.178078 3.573678 1.197217e+09\n",
"std 29539.336412 1.018590 2.167182e+08\n",
"min 1.000000 0.500000 8.281246e+08\n",
"25% 1007.000000 3.000000 1.001562e+09\n",
"50% 2471.000000 4.000000 1.180447e+09\n",
"75% 6016.000000 4.000000 1.431955e+09\n",
"max 187593.000000 5.000000 1.537799e+09"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings.describe()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2269"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(ratings['movieId'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"610"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(ratings.index.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## aplicando filtro no movies dataset"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Adventure|Animation|Children|Comedy|Fantasy | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Jumanji (1995) | \n",
" Adventure|Children|Fantasy | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Grumpier Old Men (1995) | \n",
" Comedy|Romance | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Waiting to Exhale (1995) | \n",
" Comedy|Drama|Romance | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Father of the Bride Part II (1995) | \n",
" Comedy | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId title \\\n",
"0 1 Toy Story (1995) \n",
"1 2 Jumanji (1995) \n",
"2 3 Grumpier Old Men (1995) \n",
"3 4 Waiting to Exhale (1995) \n",
"4 5 Father of the Bride Part II (1995) \n",
"\n",
" genres \n",
"0 Adventure|Animation|Children|Comedy|Fantasy \n",
"1 Adventure|Children|Fantasy \n",
"2 Comedy|Romance \n",
"3 Comedy|Drama|Romance \n",
"4 Comedy "
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies = pd.read_csv('../data/standard/movies.csv')\n",
"movies.head()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"movies = movies[movies['movieId'].isin(ratings['movieId'])]"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 2269.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 20530.586161 | \n",
"
\n",
" \n",
" std | \n",
" 35185.840333 | \n",
"
\n",
" \n",
" min | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 1345.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 3256.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 8958.000000 | \n",
"
\n",
" \n",
" max | \n",
" 187593.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId\n",
"count 2269.000000\n",
"mean 20530.586161\n",
"std 35185.840333\n",
"min 1.000000\n",
"25% 1345.000000\n",
"50% 3256.000000\n",
"75% 8958.000000\n",
"max 187593.000000"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.describe()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Index: 2269 entries, 0 to 9709\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 movieId 2269 non-null int64 \n",
" 1 title 2269 non-null object\n",
" 2 genres 2269 non-null object\n",
"dtypes: int64(1), object(2)\n",
"memory usage: 70.9+ KB\n"
]
}
],
"source": [
"movies.info()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"movies.set_index('movieId', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Adventure|Animation|Children|Comedy|Fantasy | \n",
"
\n",
" \n",
" 2 | \n",
" Jumanji (1995) | \n",
" Adventure|Children|Fantasy | \n",
"
\n",
" \n",
" 3 | \n",
" Grumpier Old Men (1995) | \n",
" Comedy|Romance | \n",
"
\n",
" \n",
" 5 | \n",
" Father of the Bride Part II (1995) | \n",
" Comedy | \n",
"
\n",
" \n",
" 6 | \n",
" Heat (1995) | \n",
" Action|Crime|Thriller | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 174055 | \n",
" Dunkirk (2017) | \n",
" Action|Drama|Thriller|War | \n",
"
\n",
" \n",
" 176371 | \n",
" Blade Runner 2049 (2017) | \n",
" Sci-Fi | \n",
"
\n",
" \n",
" 177765 | \n",
" Coco (2017) | \n",
" Adventure|Animation|Children | \n",
"
\n",
" \n",
" 179819 | \n",
" Star Wars: The Last Jedi (2017) | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
"
\n",
" \n",
" 187593 | \n",
" Deadpool 2 (2018) | \n",
" Action|Comedy|Sci-Fi | \n",
"
\n",
" \n",
"
\n",
"
2269 rows × 2 columns
\n",
"
"
],
"text/plain": [
" title \\\n",
"movieId \n",
"1 Toy Story (1995) \n",
"2 Jumanji (1995) \n",
"3 Grumpier Old Men (1995) \n",
"5 Father of the Bride Part II (1995) \n",
"6 Heat (1995) \n",
"... ... \n",
"174055 Dunkirk (2017) \n",
"176371 Blade Runner 2049 (2017) \n",
"177765 Coco (2017) \n",
"179819 Star Wars: The Last Jedi (2017) \n",
"187593 Deadpool 2 (2018) \n",
"\n",
" genres \n",
"movieId \n",
"1 Adventure|Animation|Children|Comedy|Fantasy \n",
"2 Adventure|Children|Fantasy \n",
"3 Comedy|Romance \n",
"5 Comedy \n",
"6 Action|Crime|Thriller \n",
"... ... \n",
"174055 Action|Drama|Thriller|War \n",
"176371 Sci-Fi \n",
"177765 Adventure|Animation|Children \n",
"179819 Action|Adventure|Fantasy|Sci-Fi \n",
"187593 Action|Comedy|Sci-Fi \n",
"\n",
"[2269 rows x 2 columns]"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Aplicando filtros no conjuno de dados 'links'"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"links = pd.read_csv('../data/standard/links.csv', index_col='movieId')"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" tmdbId | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 9.742000e+03 | \n",
" 9734.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 6.771839e+05 | \n",
" 55162.123793 | \n",
"
\n",
" \n",
" std | \n",
" 1.107228e+06 | \n",
" 93653.481487 | \n",
"
\n",
" \n",
" min | \n",
" 4.170000e+02 | \n",
" 2.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 9.518075e+04 | \n",
" 9665.500000 | \n",
"
\n",
" \n",
" 50% | \n",
" 1.672605e+05 | \n",
" 16529.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 8.055685e+05 | \n",
" 44205.750000 | \n",
"
\n",
" \n",
" max | \n",
" 8.391976e+06 | \n",
" 525662.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId tmdbId\n",
"count 9.742000e+03 9734.000000\n",
"mean 6.771839e+05 55162.123793\n",
"std 1.107228e+06 93653.481487\n",
"min 4.170000e+02 2.000000\n",
"25% 9.518075e+04 9665.500000\n",
"50% 1.672605e+05 16529.000000\n",
"75% 8.055685e+05 44205.750000\n",
"max 8.391976e+06 525662.000000"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"links.describe()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"links = links[links.index.isin(movies.index)]"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" tmdbId | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 2.269000e+03 | \n",
" 2269.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 3.670981e+05 | \n",
" 19720.701190 | \n",
"
\n",
" \n",
" std | \n",
" 5.870238e+05 | \n",
" 49425.176137 | \n",
"
\n",
" \n",
" min | \n",
" 1.344200e+04 | \n",
" 5.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 9.965300e+04 | \n",
" 1452.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 1.199510e+05 | \n",
" 9071.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 3.471490e+05 | \n",
" 11566.000000 | \n",
"
\n",
" \n",
" max | \n",
" 5.463162e+06 | \n",
" 503475.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId tmdbId\n",
"count 2.269000e+03 2269.000000\n",
"mean 3.670981e+05 19720.701190\n",
"std 5.870238e+05 49425.176137\n",
"min 1.344200e+04 5.000000\n",
"25% 9.965300e+04 1452.000000\n",
"50% 1.199510e+05 9071.000000\n",
"75% 3.471490e+05 11566.000000\n",
"max 5.463162e+06 503475.000000"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"links.describe()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" userId | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1 | \n",
" 4.0 | \n",
" 964982703 | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" 4.0 | \n",
" 964981247 | \n",
"
\n",
" \n",
" 1 | \n",
" 6 | \n",
" 4.0 | \n",
" 964982224 | \n",
"
\n",
" \n",
" 1 | \n",
" 47 | \n",
" 5.0 | \n",
" 964983815 | \n",
"
\n",
" \n",
" 1 | \n",
" 50 | \n",
" 5.0 | \n",
" 964982931 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 610 | \n",
" 159093 | \n",
" 3.0 | \n",
" 1493847704 | \n",
"
\n",
" \n",
" 610 | \n",
" 164179 | \n",
" 5.0 | \n",
" 1493845631 | \n",
"
\n",
" \n",
" 610 | \n",
" 166528 | \n",
" 4.0 | \n",
" 1493879365 | \n",
"
\n",
" \n",
" 610 | \n",
" 168250 | \n",
" 5.0 | \n",
" 1494273047 | \n",
"
\n",
" \n",
" 610 | \n",
" 168252 | \n",
" 5.0 | \n",
" 1493846352 | \n",
"
\n",
" \n",
"
\n",
"
81116 rows × 3 columns
\n",
"
"
],
"text/plain": [
" movieId rating timestamp\n",
"userId \n",
"1 1 4.0 964982703\n",
"1 3 4.0 964981247\n",
"1 6 4.0 964982224\n",
"1 47 5.0 964983815\n",
"1 50 5.0 964982931\n",
"... ... ... ...\n",
"610 159093 3.0 1493847704\n",
"610 164179 5.0 1493845631\n",
"610 166528 4.0 1493879365\n",
"610 168250 5.0 1494273047\n",
"610 168252 5.0 1493846352\n",
"\n",
"[81116 rows x 3 columns]"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"ratings.to_csv('../data/reduced/ratings_m{}.csv'.format(n_ratings_movie), index_label='userId')"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"movies.to_csv('../data/reduced/movies_m{}.csv'.format(n_ratings_movie))"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"links.to_csv('../data/reduced/links_m{}.csv'.format(n_ratings_movie))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}