{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Filtros para reduzir o número de instâncias dos conjuntos de dados\n", "Motivação:\n", "\n", "Percebi que existem filmes com poucas avaliações. Como podemos dizer se um filme é bom com apenas 1 ou 2 avaliações?\n", "\n", "Portanto, aqui reduzimos o conjunto de dados ao remover usuários e filmes com pouca \"relevância\" segundo os conceitos citados anteriormente." ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filtro de número de avaliações por usuário e filme" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Defina o limite mínimo para permanecer" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "n_ratings_movie = 10" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdratingtimestamp
userId
114.0964982703
134.0964981247
164.0964982224
1475.0964983815
1505.0964982931
\n", "
" ], "text/plain": [ " movieId rating timestamp\n", "userId \n", "1 1 4.0 964982703\n", "1 3 4.0 964981247\n", "1 6 4.0 964982224\n", "1 47 5.0 964983815\n", "1 50 5.0 964982931" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings = pd.read_csv('../data/standard/ratings.csv', index_col='userId')\n", "ratings.head()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdratingtimestamp
count100836.000000100836.0000001.008360e+05
mean19435.2957183.5015571.205946e+09
std35530.9871991.0425292.162610e+08
min1.0000000.5000008.281246e+08
25%1199.0000003.0000001.019124e+09
50%2991.0000003.5000001.186087e+09
75%8122.0000004.0000001.435994e+09
max193609.0000005.0000001.537799e+09
\n", "
" ], "text/plain": [ " movieId rating timestamp\n", "count 100836.000000 100836.000000 1.008360e+05\n", "mean 19435.295718 3.501557 1.205946e+09\n", "std 35530.987199 1.042529 2.162610e+08\n", "min 1.000000 0.500000 8.281246e+08\n", "25% 1199.000000 3.000000 1.019124e+09\n", "50% 2991.000000 3.500000 1.186087e+09\n", "75% 8122.000000 4.000000 1.435994e+09\n", "max 193609.000000 5.000000 1.537799e+09" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.describe()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 610.000000\n", "mean 165.304918\n", "std 269.480584\n", "min 20.000000\n", "25% 35.000000\n", "50% 70.500000\n", "75% 168.000000\n", "max 2698.000000\n", "Name: count, dtype: float64" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.index.value_counts().describe()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 9724.000000\n", "mean 10.369807\n", "std 22.401005\n", "min 1.000000\n", "25% 1.000000\n", "50% 3.000000\n", "75% 9.000000\n", "max 329.000000\n", "Name: count, dtype: float64" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings['movieId'].value_counts().describe()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "def filter_n_ratings(df, column, n_ratings):\n", " low = df[column].value_counts() >= n_ratings\n", " low = low[low == True]\n", " return df[df[column].isin(low.index)]\n" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "ratings = filter_n_ratings(ratings, 'movieId', n_ratings_movie)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdratingtimestamp
count81116.00000081116.0000008.111600e+04
mean14857.1780783.5736781.197217e+09
std29539.3364121.0185902.167182e+08
min1.0000000.5000008.281246e+08
25%1007.0000003.0000001.001562e+09
50%2471.0000004.0000001.180447e+09
75%6016.0000004.0000001.431955e+09
max187593.0000005.0000001.537799e+09
\n", "
" ], "text/plain": [ " movieId rating timestamp\n", "count 81116.000000 81116.000000 8.111600e+04\n", "mean 14857.178078 3.573678 1.197217e+09\n", "std 29539.336412 1.018590 2.167182e+08\n", "min 1.000000 0.500000 8.281246e+08\n", "25% 1007.000000 3.000000 1.001562e+09\n", "50% 2471.000000 4.000000 1.180447e+09\n", "75% 6016.000000 4.000000 1.431955e+09\n", "max 187593.000000 5.000000 1.537799e+09" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.describe()" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2269" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(ratings['movieId'].value_counts())" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "610" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(ratings.index.value_counts())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## aplicando filtro no movies dataset" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
\n", "
" ], "text/plain": [ " movieId title \\\n", "0 1 Toy Story (1995) \n", "1 2 Jumanji (1995) \n", "2 3 Grumpier Old Men (1995) \n", "3 4 Waiting to Exhale (1995) \n", "4 5 Father of the Bride Part II (1995) \n", "\n", " genres \n", "0 Adventure|Animation|Children|Comedy|Fantasy \n", "1 Adventure|Children|Fantasy \n", "2 Comedy|Romance \n", "3 Comedy|Drama|Romance \n", "4 Comedy " ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies = pd.read_csv('../data/standard/movies.csv')\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "movies = movies[movies['movieId'].isin(ratings['movieId'])]" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieId
count2269.000000
mean20530.586161
std35185.840333
min1.000000
25%1345.000000
50%3256.000000
75%8958.000000
max187593.000000
\n", "
" ], "text/plain": [ " movieId\n", "count 2269.000000\n", "mean 20530.586161\n", "std 35185.840333\n", "min 1.000000\n", "25% 1345.000000\n", "50% 3256.000000\n", "75% 8958.000000\n", "max 187593.000000" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.describe()" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 2269 entries, 0 to 9709\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 movieId 2269 non-null int64 \n", " 1 title 2269 non-null object\n", " 2 genres 2269 non-null object\n", "dtypes: int64(1), object(2)\n", "memory usage: 70.9+ KB\n" ] } ], "source": [ "movies.info()" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "movies.set_index('movieId', inplace=True)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenres
movieId
1Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
2Jumanji (1995)Adventure|Children|Fantasy
3Grumpier Old Men (1995)Comedy|Romance
5Father of the Bride Part II (1995)Comedy
6Heat (1995)Action|Crime|Thriller
.........
174055Dunkirk (2017)Action|Drama|Thriller|War
176371Blade Runner 2049 (2017)Sci-Fi
177765Coco (2017)Adventure|Animation|Children
179819Star Wars: The Last Jedi (2017)Action|Adventure|Fantasy|Sci-Fi
187593Deadpool 2 (2018)Action|Comedy|Sci-Fi
\n", "

2269 rows × 2 columns

\n", "
" ], "text/plain": [ " title \\\n", "movieId \n", "1 Toy Story (1995) \n", "2 Jumanji (1995) \n", "3 Grumpier Old Men (1995) \n", "5 Father of the Bride Part II (1995) \n", "6 Heat (1995) \n", "... ... \n", "174055 Dunkirk (2017) \n", "176371 Blade Runner 2049 (2017) \n", "177765 Coco (2017) \n", "179819 Star Wars: The Last Jedi (2017) \n", "187593 Deadpool 2 (2018) \n", "\n", " genres \n", "movieId \n", "1 Adventure|Animation|Children|Comedy|Fantasy \n", "2 Adventure|Children|Fantasy \n", "3 Comedy|Romance \n", "5 Comedy \n", "6 Action|Crime|Thriller \n", "... ... \n", "174055 Action|Drama|Thriller|War \n", "176371 Sci-Fi \n", "177765 Adventure|Animation|Children \n", "179819 Action|Adventure|Fantasy|Sci-Fi \n", "187593 Action|Comedy|Sci-Fi \n", "\n", "[2269 rows x 2 columns]" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Aplicando filtros no conjunto de dados 'links'" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "links = pd.read_csv('../data/standard/links.csv', index_col='movieId')" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdtmdbId
count9.742000e+039734.000000
mean6.771839e+0555162.123793
std1.107228e+0693653.481487
min4.170000e+022.000000
25%9.518075e+049665.500000
50%1.672605e+0516529.000000
75%8.055685e+0544205.750000
max8.391976e+06525662.000000
\n", "
" ], "text/plain": [ " imdbId tmdbId\n", "count 9.742000e+03 9734.000000\n", "mean 6.771839e+05 55162.123793\n", "std 1.107228e+06 93653.481487\n", "min 4.170000e+02 2.000000\n", "25% 9.518075e+04 9665.500000\n", "50% 1.672605e+05 16529.000000\n", "75% 8.055685e+05 44205.750000\n", "max 8.391976e+06 525662.000000" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "links.describe()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "links = links[links.index.isin(movies.index)]" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdtmdbId
count2.269000e+032269.000000
mean3.670981e+0519720.701190
std5.870238e+0549425.176137
min1.344200e+045.000000
25%9.965300e+041452.000000
50%1.199510e+059071.000000
75%3.471490e+0511566.000000
max5.463162e+06503475.000000
\n", "
" ], "text/plain": [ " imdbId tmdbId\n", "count 2.269000e+03 2269.000000\n", "mean 3.670981e+05 19720.701190\n", "std 5.870238e+05 49425.176137\n", "min 1.344200e+04 5.000000\n", "25% 9.965300e+04 1452.000000\n", "50% 1.199510e+05 9071.000000\n", "75% 3.471490e+05 11566.000000\n", "max 5.463162e+06 503475.000000" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "links.describe()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdratingtimestamp
userId
114.0964982703
134.0964981247
164.0964982224
1475.0964983815
1505.0964982931
............
6101590933.01493847704
6101641795.01493845631
6101665284.01493879365
6101682505.01494273047
6101682525.01493846352
\n", "

81116 rows × 3 columns

\n", "
" ], "text/plain": [ " movieId rating timestamp\n", "userId \n", "1 1 4.0 964982703\n", "1 3 4.0 964981247\n", "1 6 4.0 964982224\n", "1 47 5.0 964983815\n", "1 50 5.0 964982931\n", "... ... ... ...\n", "610 159093 3.0 1493847704\n", "610 164179 5.0 1493845631\n", "610 166528 4.0 1493879365\n", "610 168250 5.0 1494273047\n", "610 168252 5.0 1493846352\n", "\n", "[81116 rows x 3 columns]" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "ratings.to_csv('../data/reduced/ratings_m{}.csv'.format(n_ratings_movie), index_label='userId')" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "movies.to_csv('../data/reduced/movies_m{}.csv'.format(n_ratings_movie))" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "links.to_csv('../data/reduced/links_m{}.csv'.format(n_ratings_movie))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }