{
"cells": [
{
"cell_type": "code",
"execution_count": 277,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from fuzzywuzzy import process"
]
},
{
"cell_type": "code",
"execution_count": 278,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 4.0 | \n",
" 964982703 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 3 | \n",
" 4.0 | \n",
" 964981247 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 6 | \n",
" 4.0 | \n",
" 964982224 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 47 | \n",
" 5.0 | \n",
" 964983815 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 50 | \n",
" 5.0 | \n",
" 964982931 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 81111 | \n",
" 610 | \n",
" 159093 | \n",
" 3.0 | \n",
" 1493847704 | \n",
"
\n",
" \n",
" 81112 | \n",
" 610 | \n",
" 164179 | \n",
" 5.0 | \n",
" 1493845631 | \n",
"
\n",
" \n",
" 81113 | \n",
" 610 | \n",
" 166528 | \n",
" 4.0 | \n",
" 1493879365 | \n",
"
\n",
" \n",
" 81114 | \n",
" 610 | \n",
" 168250 | \n",
" 5.0 | \n",
" 1494273047 | \n",
"
\n",
" \n",
" 81115 | \n",
" 610 | \n",
" 168252 | \n",
" 5.0 | \n",
" 1493846352 | \n",
"
\n",
" \n",
"
\n",
"
81116 rows × 4 columns
\n",
"
"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 1 4.0 964982703\n",
"1 1 3 4.0 964981247\n",
"2 1 6 4.0 964982224\n",
"3 1 47 5.0 964983815\n",
"4 1 50 5.0 964982931\n",
"... ... ... ... ...\n",
"81111 610 159093 3.0 1493847704\n",
"81112 610 164179 5.0 1493845631\n",
"81113 610 166528 4.0 1493879365\n",
"81114 610 168250 5.0 1494273047\n",
"81115 610 168252 5.0 1493846352\n",
"\n",
"[81116 rows x 4 columns]"
]
},
"execution_count": 278,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings = pd.read_csv('../data/reduced/ratings_m10.csv')\n",
"ratings.reindex()"
]
},
{
"cell_type": "code",
"execution_count": 279,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" tmdbId | \n",
" imdbId | \n",
" cast | \n",
" director | \n",
" keywords | \n",
" overview | \n",
" title | \n",
" genres | \n",
" year | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 290 | \n",
" 290 | \n",
" 9100 | \n",
" 115963 | \n",
" ['Fairuza Balk', 'Neve Campbell', 'Robin Tunne... | \n",
" Andrew Fleming | \n",
" ['witch', 'suicide attempt', 'becoming an adul... | \n",
" A Catholic school newcomer falls in with a cli... | \n",
" Craft, The | \n",
" ['Drama', 'Fantasy', 'Horror', 'Thriller'] | \n",
" 1996 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 tmdbId imdbId \\\n",
"movieId \n",
"290 290 9100 115963 \n",
"\n",
" cast director \\\n",
"movieId \n",
"290 ['Fairuza Balk', 'Neve Campbell', 'Robin Tunne... Andrew Fleming \n",
"\n",
" keywords \\\n",
"movieId \n",
"290 ['witch', 'suicide attempt', 'becoming an adul... \n",
"\n",
" overview title \\\n",
"movieId \n",
"290 A Catholic school newcomer falls in with a cli... Craft, The \n",
"\n",
" genres year \n",
"movieId \n",
"290 ['Drama', 'Fantasy', 'Horror', 'Thriller'] 1996 "
]
},
"execution_count": 279,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')\n",
"movies.sample()"
]
},
{
"cell_type": "code",
"execution_count": 280,
"metadata": {},
"outputs": [],
"source": [
"movies_title = movies[['title']]"
]
},
{
"cell_type": "code",
"execution_count": 281,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 4.0 | \n",
" 964982703 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 1 | \n",
" 4.0 | \n",
" 847434962 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 2 | \n",
" 7 | \n",
" 1 | \n",
" 4.5 | \n",
" 1106635946 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 3 | \n",
" 15 | \n",
" 1 | \n",
" 2.5 | \n",
" 1510577970 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 4 | \n",
" 17 | \n",
" 1 | \n",
" 4.5 | \n",
" 1305696483 | \n",
" Toy Story | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 35851 | \n",
" 546 | \n",
" 1327 | \n",
" 3.0 | \n",
" 973588711 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35852 | \n",
" 555 | \n",
" 1327 | \n",
" 3.0 | \n",
" 978748648 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35853 | \n",
" 571 | \n",
" 1327 | \n",
" 5.0 | \n",
" 966900601 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35854 | \n",
" 600 | \n",
" 1327 | \n",
" 2.0 | \n",
" 1237710102 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35855 | \n",
" 607 | \n",
" 1327 | \n",
" 5.0 | \n",
" 963079647 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
"
\n",
"
35856 rows × 5 columns
\n",
"
"
],
"text/plain": [
" userId movieId rating timestamp title\n",
"0 1 1 4.0 964982703 Toy Story \n",
"1 5 1 4.0 847434962 Toy Story \n",
"2 7 1 4.5 1106635946 Toy Story \n",
"3 15 1 2.5 1510577970 Toy Story \n",
"4 17 1 4.5 1305696483 Toy Story \n",
"... ... ... ... ... ...\n",
"35851 546 1327 3.0 973588711 Scooby-Doo \n",
"35852 555 1327 3.0 978748648 Scooby-Doo \n",
"35853 571 1327 5.0 966900601 Scooby-Doo \n",
"35854 600 1327 2.0 1237710102 Scooby-Doo \n",
"35855 607 1327 5.0 963079647 Scooby-Doo \n",
"\n",
"[35856 rows x 5 columns]"
]
},
"execution_count": 281,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rating_movie = ratings.merge(movies_title, on='movieId')\n",
"rating_movie"
]
},
{
"cell_type": "code",
"execution_count": 282,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 4.0 | \n",
" 964982703 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 1 | \n",
" 4.0 | \n",
" 847434962 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 2 | \n",
" 7 | \n",
" 1 | \n",
" 4.5 | \n",
" 1106635946 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 3 | \n",
" 15 | \n",
" 1 | \n",
" 2.5 | \n",
" 1510577970 | \n",
" Toy Story | \n",
"
\n",
" \n",
" 4 | \n",
" 17 | \n",
" 1 | \n",
" 4.5 | \n",
" 1305696483 | \n",
" Toy Story | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 35851 | \n",
" 546 | \n",
" 1327 | \n",
" 3.0 | \n",
" 973588711 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35852 | \n",
" 555 | \n",
" 1327 | \n",
" 3.0 | \n",
" 978748648 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35853 | \n",
" 571 | \n",
" 1327 | \n",
" 5.0 | \n",
" 966900601 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35854 | \n",
" 600 | \n",
" 1327 | \n",
" 2.0 | \n",
" 1237710102 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
" 35855 | \n",
" 607 | \n",
" 1327 | \n",
" 5.0 | \n",
" 963079647 | \n",
" Scooby-Doo | \n",
"
\n",
" \n",
"
\n",
"
35856 rows × 5 columns
\n",
"
"
],
"text/plain": [
" userId movieId rating timestamp title\n",
"0 1 1 4.0 964982703 Toy Story \n",
"1 5 1 4.0 847434962 Toy Story \n",
"2 7 1 4.5 1106635946 Toy Story \n",
"3 15 1 2.5 1510577970 Toy Story \n",
"4 17 1 4.5 1305696483 Toy Story \n",
"... ... ... ... ... ...\n",
"35851 546 1327 3.0 973588711 Scooby-Doo \n",
"35852 555 1327 3.0 978748648 Scooby-Doo \n",
"35853 571 1327 5.0 966900601 Scooby-Doo \n",
"35854 600 1327 2.0 1237710102 Scooby-Doo \n",
"35855 607 1327 5.0 963079647 Scooby-Doo \n",
"\n",
"[35856 rows x 5 columns]"
]
},
"execution_count": 282,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rating_movie"
]
},
{
"cell_type": "code",
"execution_count": 283,
"metadata": {},
"outputs": [],
"source": [
"def train_test_column_split(df: pd.DataFrame, group_column: str, split_column: str, y_label: str, train_size: float):\n",
" df = df.sort_values(by=split_column, ascending=True) \n",
" train = pd.DataFrame(columns=df.columns)\n",
" test = pd.DataFrame(columns=df.columns)\n",
"\n",
" for idx in df[group_column].unique():\n",
" group = df.loc[df[group_column] == idx]\n",
"\n",
" q_user = group[group[split_column].le(group[split_column].quantile(train_size))]\n",
" p_user = group[group[split_column].ge(group[split_column].quantile(train_size))]\n",
"\n",
" train = pd.concat([train, q_user])\n",
" test = pd.concat([test, p_user])\n",
" train = train.sort_index(ascending=True)\n",
" test = test.sort_index(ascending=True)\n",
"\n",
" X_labels = [c for c in df.columns if c != y_label]\n",
"\n",
" X_train = train[X_labels]\n",
" X_test = test[X_labels]\n",
" y_train = train[y_label]\n",
" y_test = test[y_label]\n",
"\n",
" return (X_train, X_test, y_train, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 284,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_column_split(rating_movie, 'userId', 'timestamp', 'rating', .8)"
]
},
{
"cell_type": "code",
"execution_count": 285,
"metadata": {},
"outputs": [],
"source": [
"train = pd.concat([X_train, y_train], axis=1)\n",
"test = pd.concat([X_test, y_test], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 286,
"metadata": {},
"outputs": [],
"source": [
"user_movie_mat = rating_movie.pivot(index='movieId', columns='userId', values='rating').fillna(0)\n",
"user_movie_mat_train = train.pivot(index='movieId', columns='userId', values='rating').fillna(0)\n",
"user_movie_mat_test = test.pivot(index='movieId', columns='userId', values='rating').fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 287,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" userId | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" ... | \n",
" 601 | \n",
" 602 | \n",
" 603 | \n",
" 604 | \n",
" 605 | \n",
" 606 | \n",
" 607 | \n",
" 608 | \n",
" 609 | \n",
" 610 | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.5 | \n",
" 0.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2018 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2019 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.5 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2020 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2021 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2023 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
766 rows × 608 columns
\n",
"
"
],
"text/plain": [
"userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n",
"movieId ... \n",
"1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 4.0 0.0 \n",
"3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2019 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n",
"2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n",
"2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n",
"\n",
"userId 604 605 606 607 608 609 610 \n",
"movieId \n",
"1 3.0 4.0 0.0 0.0 2.5 0.0 5.0 \n",
"2 5.0 0.0 0.0 0.0 2.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n",
"5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"6 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2019 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2021 0.0 0.0 0.0 0.0 3.5 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[766 rows x 608 columns]"
]
},
"execution_count": 287,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_movie_mat_train"
]
},
{
"cell_type": "code",
"execution_count": 288,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" userId | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" ... | \n",
" 601 | \n",
" 602 | \n",
" 603 | \n",
" 604 | \n",
" 605 | \n",
" 606 | \n",
" 607 | \n",
" 608 | \n",
" 609 | \n",
" 610 | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.5 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2018 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2019 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2020 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2021 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2023 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
763 rows × 608 columns
\n",
"
"
],
"text/plain": [
"userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n",
"movieId ... \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2019 0.0 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"\n",
"userId 604 605 606 607 608 609 610 \n",
"movieId \n",
"1 0.0 0.0 2.5 4.0 0.0 3.0 0.0 \n",
"2 0.0 3.5 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"6 0.0 0.0 0.0 0.0 0.0 0.0 5.0 \n",
"... ... ... ... ... ... ... ... \n",
"2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2021 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n",
"\n",
"[763 rows x 608 columns]"
]
},
"execution_count": 288,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_movie_mat_test"
]
},
{
"cell_type": "code",
"execution_count": 289,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" userId | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" ... | \n",
" 601 | \n",
" 602 | \n",
" 603 | \n",
" 604 | \n",
" 605 | \n",
" 606 | \n",
" 607 | \n",
" 608 | \n",
" 609 | \n",
" 610 | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 2.5 | \n",
" 4.0 | \n",
" 2.5 | \n",
" 3.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2018 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2019 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.5 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2020 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2021 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2023 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
766 rows × 608 columns
\n",
"
"
],
"text/plain": [
"userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n",
"movieId ... \n",
"1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 4.0 0.0 0.0 ... 0.0 4.0 0.0 \n",
"3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2019 0.0 0.0 0.0 2.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n",
"2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n",
"2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n",
"\n",
"userId 604 605 606 607 608 609 610 \n",
"movieId \n",
"1 3.0 4.0 2.5 4.0 2.5 3.0 5.0 \n",
"2 5.0 3.5 0.0 0.0 2.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n",
"5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"6 3.0 0.0 0.0 0.0 0.0 0.0 5.0 \n",
"... ... ... ... ... ... ... ... \n",
"2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2021 0.0 0.0 4.0 0.0 3.5 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n",
"\n",
"[766 rows x 608 columns]"
]
},
"execution_count": 289,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_movie_mat"
]
},
{
"cell_type": "code",
"execution_count": 290,
"metadata": {},
"outputs": [],
"source": [
"def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):\n",
" \"\"\"Find correlation between two users based on their rated movies using Pearson correlation\"\"\"\n",
" rated_movies_by_both = ratings_df[[user1, user2]].dropna(axis=0).values\n",
" user1_ratings = rated_movies_by_both[:, 0]\n",
" user2_ratings = rated_movies_by_both[:, 1]\n",
" return np.corrcoef(user1_ratings, user2_ratings)[0, 1]"
]
},
{
"cell_type": "code",
"execution_count": 291,
"metadata": {},
"outputs": [],
"source": [
"users_list = list(user_movie_mat.columns)\n",
"movies_list = list(user_movie_mat.index)\n",
"\n",
"#users_similarity_mat = np.array([[corr_between_users(user_movie_mat, user1, user2) for user1 in users_list] for user2 in users_list])\n",
"#users_similarity_mat = pd.DataFrame(users_similarity_mat, index=users_list, columns=users_list)\n",
"users_similarity_mat = pd.read_pickle('../data/preprocessed/users_similarity_mat.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 292,
"metadata": {},
"outputs": [],
"source": [
"def get_rated_user_for_a_movie(ratings_df: pd.DataFrame, movie: str):\n",
" return ratings_df.loc[movie, :].dropna().index.values\n",
"\n",
"\n",
"def get_top_neighbors(\n",
" similarity_df: pd.DataFrame, user: str, rated_users: str, n_neighbors: int\n",
"):\n",
" return similarity_df[user][rated_users].nlargest(n_neighbors).to_dict()\n",
"\n",
"\n",
"def subtract_bias(rating: float, mean_rating: float):\n",
" return rating - mean_rating\n",
"\n",
"\n",
"def get_neighbor_rating_without_bias_per_movie(\n",
" ratings_df: pd.DataFrame, user: str, movie: str\n",
"):\n",
" \"\"\"Substract the rating of a user from the mean rating of that user to eliminate bias\"\"\"\n",
" mean_rating = ratings_df[user].mean()\n",
" rating = ratings_df.loc[movie, user]\n",
" return subtract_bias(rating, mean_rating)\n",
"\n",
"\n",
"def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, movie: str):\n",
" \"\"\"Get the ratings of all neighbors after adjusting for biases\"\"\"\n",
" return [\n",
" get_neighbor_rating_without_bias_per_movie(ratings_df, neighbor, movie)\n",
" for neighbor in neighbors\n",
" ]\n",
"\n",
"def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):\n",
" weighted_sum = np.array(ratings).dot(np.array(neighbor_distance))\n",
" abs_neigbor_distance = np.abs(neighbor_distance)\n",
" return weighted_sum / np.sum(abs_neigbor_distance)\n",
"\n",
"\n",
"def ger_user_rating(ratings_df: pd.DataFrame, user: str, avg_neighbor_rating: float):\n",
" user_avg_rating = ratings_df[user].mean()\n",
" return round(user_avg_rating + avg_neighbor_rating, 2)\n"
]
},
{
"cell_type": "code",
"execution_count": 293,
"metadata": {},
"outputs": [],
"source": [
"def predict_rating(\n",
" df: pd.DataFrame,\n",
" similarity_df: pd.DataFrame,\n",
" user: str,\n",
" movie: str,\n",
" n_neighbors: int = 2,\n",
"):\n",
" \"\"\"Predict the rating of a user for a movie based on the ratings of neighbors\"\"\"\n",
" ratings_df = df.copy()\n",
"\n",
" rated_users = get_rated_user_for_a_movie(ratings_df, movie)\n",
"\n",
" top_neighbors_distance = get_top_neighbors(\n",
" similarity_df, user, rated_users, n_neighbors\n",
" )\n",
" neighbors, distance = top_neighbors_distance.keys(), top_neighbors_distance.values()\n",
"\n",
" #print(f\"Top {n_neighbors} neighbors of user {user}, {movie}: {list(neighbors)}, distance: {list(distance)}\")\n",
"\n",
" ratings = get_ratings_of_neighbors(ratings_df, neighbors, movie)\n",
" avg_neighbor_rating = get_weighted_average_rating_of_neighbors(\n",
" ratings, list(distance)\n",
" )\n",
"\n",
" return ger_user_rating(ratings_df, user, avg_neighbor_rating)"
]
},
{
"cell_type": "code",
"execution_count": 294,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.02"
]
},
"execution_count": 294,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movie_name = 'Heartbreakers'\n",
"user1 = 1\n",
"movie = process.extractOne(movie_name, movies['title'])[2]\n",
"rating = predict_rating(user_movie_mat, users_similarity_mat, user1, movie, 10)\n",
"rating"
]
},
{
"cell_type": "code",
"execution_count": 295,
"metadata": {},
"outputs": [],
"source": [
"def get_n_recommendations(user: int, n: int, user_movie_mat: pd.DataFrame, movies: pd.DataFrame):\n",
" full_ratings = user_movie_mat.copy()\n",
" recommendations = pd.DataFrame(columns=['movieId', 'title', 'rating'])\n",
"\n",
" for movie, _ in full_ratings[user].items():\n",
" if np.isnan(full_ratings.loc[movie, user]) or full_ratings.loc[movie, user] == 0:\n",
" full_ratings.loc[movie, user] = predict_rating(user_movie_mat, users_similarity_mat, user, movie, 10)\n",
" new_row = {'movieId': movie, 'title': movies.loc[movie]['title'], 'rating': full_ratings.loc[movie, user]}\n",
" recommendations.loc[len(recommendations)] = new_row\n",
"\n",
" recommendations = recommendations.sort_values(by='rating', ascending=False)\n",
" return recommendations.head(n) if n > 0 else recommendations"
]
},
{
"cell_type": "code",
"execution_count": 296,
"metadata": {},
"outputs": [],
"source": [
"full_ratings = pd.read_csv('../data/preprocessed/full_ratings_comp.csv', index_col='movieId')"
]
},
{
"cell_type": "code",
"execution_count": 310,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" title | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" 403 | \n",
" 1200 | \n",
" Heartbreakers | \n",
" 3.02 | \n",
"
\n",
" \n",
" 303 | \n",
" 858 | \n",
" South Park: Bigger, Longer and Uncut | \n",
" 2.75 | \n",
"
\n",
" \n",
" 243 | \n",
" 589 | \n",
" Fallen | \n",
" 2.58 | \n",
"
\n",
" \n",
" 232 | \n",
" 541 | \n",
" George of the Jungle | \n",
" 2.51 | \n",
"
\n",
" \n",
" 494 | \n",
" 1374 | \n",
" Final Destination 2 | \n",
" 2.46 | \n",
"
\n",
" \n",
" 415 | \n",
" 1221 | \n",
" Evolution | \n",
" 2.42 | \n",
"
\n",
" \n",
" 326 | \n",
" 924 | \n",
" Goldfinger | \n",
" 2.40 | \n",
"
\n",
" \n",
" 549 | \n",
" 1610 | \n",
" Hard Candy | \n",
" 2.16 | \n",
"
\n",
" \n",
" 357 | \n",
" 1036 | \n",
" Great Muppet Caper, The | \n",
" 2.13 | \n",
"
\n",
" \n",
" 384 | \n",
" 1129 | \n",
" Hollow Man | \n",
" 2.09 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId title rating\n",
"403 1200 Heartbreakers 3.02\n",
"303 858 South Park: Bigger, Longer and Uncut 2.75\n",
"243 589 Fallen 2.58\n",
"232 541 George of the Jungle 2.51\n",
"494 1374 Final Destination 2 2.46\n",
"415 1221 Evolution 2.42\n",
"326 924 Goldfinger 2.40\n",
"549 1610 Hard Candy 2.16\n",
"357 1036 Great Muppet Caper, The 2.13\n",
"384 1129 Hollow Man 2.09"
]
},
"execution_count": 310,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_id = 1\n",
"n_recommendations = 10\n",
"\n",
"get_n_recommendations(user_id, n_recommendations, user_movie_mat, movies)"
]
},
{
"cell_type": "code",
"execution_count": 298,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" userId | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" ... | \n",
" 601 | \n",
" 602 | \n",
" 603 | \n",
" 604 | \n",
" 605 | \n",
" 606 | \n",
" 607 | \n",
" 608 | \n",
" 609 | \n",
" 610 | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 2.5 | \n",
" 4.0 | \n",
" 2.5 | \n",
" 3.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2018 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2019 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.5 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2020 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2021 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2023 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
766 rows × 608 columns
\n",
"
"
],
"text/plain": [
"userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n",
"movieId ... \n",
"1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 4.0 0.0 0.0 ... 0.0 4.0 0.0 \n",
"3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2019 0.0 0.0 0.0 2.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n",
"2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n",
"2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n",
"\n",
"userId 604 605 606 607 608 609 610 \n",
"movieId \n",
"1 3.0 4.0 2.5 4.0 2.5 3.0 5.0 \n",
"2 5.0 3.5 0.0 0.0 2.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n",
"5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"6 3.0 0.0 0.0 0.0 0.0 0.0 5.0 \n",
"... ... ... ... ... ... ... ... \n",
"2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2021 0.0 0.0 4.0 0.0 3.5 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n",
"\n",
"[766 rows x 608 columns]"
]
},
"execution_count": 298,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_movie_mat"
]
},
{
"cell_type": "code",
"execution_count": 299,
"metadata": {},
"outputs": [],
"source": [
"def store_ratings(user: int, n: int, user_movie_mat: pd.DataFrame, movies: pd.DataFrame):\n",
" full_ratings = user_movie_mat.copy()\n",
"\n",
" for movie, _ in user_movie_mat[user].items():\n",
" if np.isnan(user_movie_mat.loc[movie, user]) or user_movie_mat.loc[movie, user] == 0:\n",
" user_movie_mat.loc[movie, user] = predict_rating(user_movie_mat, users_similarity_mat, user, movie, 100)"
]
},
{
"cell_type": "code",
"execution_count": 300,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" title | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" 438 | \n",
" 1240 | \n",
" Dirty Rotten Scoundrels | \n",
" 2.98 | \n",
"
\n",
" \n",
" 413 | \n",
" 1200 | \n",
" Heartbreakers | \n",
" 2.81 | \n",
"
\n",
" \n",
" 311 | \n",
" 858 | \n",
" South Park: Bigger, Longer and Uncut | \n",
" 2.78 | \n",
"
\n",
" \n",
" 237 | \n",
" 541 | \n",
" George of the Jungle | \n",
" 2.53 | \n",
"
\n",
" \n",
" 426 | \n",
" 1221 | \n",
" Evolution | \n",
" 2.44 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 389 | \n",
" 1103 | \n",
" Road Warrior, The (Mad Max 2) | \n",
" -0.12 | \n",
"
\n",
" \n",
" 96 | \n",
" 207 | \n",
" Free Willy | \n",
" -0.12 | \n",
"
\n",
" \n",
" 386 | \n",
" 1096 | \n",
" Flatliners | \n",
" -0.12 | \n",
"
\n",
" \n",
" 385 | \n",
" 1095 | \n",
" Blood Simple | \n",
" -0.12 | \n",
"
\n",
" \n",
" 90 | \n",
" 194 | \n",
" Black Beauty | \n",
" -0.12 | \n",
"
\n",
" \n",
"
\n",
"
676 rows × 3 columns
\n",
"
"
],
"text/plain": [
" movieId title rating\n",
"438 1240 Dirty Rotten Scoundrels 2.98\n",
"413 1200 Heartbreakers 2.81\n",
"311 858 South Park: Bigger, Longer and Uncut 2.78\n",
"237 541 George of the Jungle 2.53\n",
"426 1221 Evolution 2.44\n",
".. ... ... ...\n",
"389 1103 Road Warrior, The (Mad Max 2) -0.12\n",
"96 207 Free Willy -0.12\n",
"386 1096 Flatliners -0.12\n",
"385 1095 Blood Simple -0.12\n",
"90 194 Black Beauty -0.12\n",
"\n",
"[676 rows x 3 columns]"
]
},
"execution_count": 300,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_n_recommendations(user_id, n_recommendations, user_movie_mat_train, movies)"
]
},
{
"cell_type": "code",
"execution_count": 301,
"metadata": {},
"outputs": [],
"source": [
"from math import sqrt"
]
},
{
"cell_type": "code",
"execution_count": 302,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" userId | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" ... | \n",
" 601 | \n",
" 602 | \n",
" 603 | \n",
" 604 | \n",
" 605 | \n",
" 606 | \n",
" 607 | \n",
" 608 | \n",
" 609 | \n",
" 610 | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 2.5 | \n",
" 4.0 | \n",
" 2.5 | \n",
" 3.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2018 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2019 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.5 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2020 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2021 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2023 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
766 rows × 608 columns
\n",
"
"
],
"text/plain": [
"userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n",
"movieId ... \n",
"1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 4.0 0.0 0.0 ... 0.0 4.0 0.0 \n",
"3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2019 0.0 0.0 0.0 2.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n",
"2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n",
"2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n",
"\n",
"userId 604 605 606 607 608 609 610 \n",
"movieId \n",
"1 3.0 4.0 2.5 4.0 2.5 3.0 5.0 \n",
"2 5.0 3.5 0.0 0.0 2.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n",
"5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"6 3.0 0.0 0.0 0.0 0.0 0.0 5.0 \n",
"... ... ... ... ... ... ... ... \n",
"2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2021 0.0 0.0 4.0 0.0 3.5 0.0 0.0 \n",
"2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n",
"\n",
"[766 rows x 608 columns]"
]
},
"execution_count": 302,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_movie_mat"
]
},
{
"cell_type": "code",
"execution_count": 303,
"metadata": {},
"outputs": [],
"source": [
"user = 1\n",
"SSE = 0\n",
"c = 0"
]
},
{
"cell_type": "code",
"execution_count": 304,
"metadata": {},
"outputs": [],
"source": [
"for movie, _ in user_movie_mat_train[user].items():\n",
" if np.isnan(user_movie_mat_train.loc[movie, user]) or user_movie_mat_train.loc[movie, user] == 0:\n",
" user_movie_mat_train.loc[movie, user] = predict_rating(user_movie_mat_train, users_similarity_mat, user, movie, 100)"
]
},
{
"cell_type": "code",
"execution_count": 305,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" userId | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" ... | \n",
" 601 | \n",
" 602 | \n",
" 603 | \n",
" 604 | \n",
" 605 | \n",
" 606 | \n",
" 607 | \n",
" 608 | \n",
" 609 | \n",
" 610 | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4.00 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.5 | \n",
" 0.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.90 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.00 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.12 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 4.00 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2018 | \n",
" 5.00 | \n",
" 0.0 | \n",
" 0.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2019 | \n",
" 1.21 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 4.5 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2020 | \n",
" 0.76 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2021 | \n",
" 1.08 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2023 | \n",
" 1.14 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
766 rows × 608 columns
\n",
"
"
],
"text/plain": [
"userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 \\\n",
"movieId ... \n",
"1 4.00 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 \n",
"2 0.90 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 4.0 \n",
"3 4.00 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"5 0.12 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"6 4.00 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"2018 5.00 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2019 1.21 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.5 0.0 \n",
"2020 0.76 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2021 1.08 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2023 1.14 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 \n",
"\n",
"userId 603 604 605 606 607 608 609 610 \n",
"movieId \n",
"1 4.0 3.0 4.0 0.0 0.0 2.5 0.0 5.0 \n",
"2 0.0 5.0 0.0 0.0 0.0 2.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n",
"5 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"6 4.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... ... ... ... \n",
"2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2019 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2020 4.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n",
"2021 0.0 0.0 0.0 0.0 0.0 3.5 0.0 0.0 \n",
"2023 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[766 rows x 608 columns]"
]
},
"execution_count": 305,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user_movie_mat_train"
]
},
{
"cell_type": "code",
"execution_count": 306,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.1835473787967214\n"
]
}
],
"source": [
"\n",
"for movie, _ in user_movie_mat_test[user].items():\n",
" if user_movie_mat_test.loc[movie, user] != 0:\n",
" #print(user_movie_mat_test.loc[movie, user], user_movie_mat_train.loc[movie, user])\n",
" E = user_movie_mat_test.loc[movie, user] - user_movie_mat_train.loc[movie, user]\n",
" SSE = SSE + pow(E, 2)\n",
" c = c+1\n",
"MSE = SSE/c\n",
"RMSE = sqrt(MSE)\n",
"print(RMSE)"
]
},
{
"cell_type": "code",
"execution_count": 307,
"metadata": {},
"outputs": [],
"source": [
"def ger_full_ratings():\n",
" full_ratings = user_movie_mat.copy()\n",
"\n",
" for user, movies in full_ratings.items():\n",
" for movie in movies.keys():\n",
" if np.isnan(full_ratings.loc[movie, user]) or full_ratings.loc[movie, user] == 0:\n",
" full_ratings.loc[movie, user] = predict_rating(\n",
" user_movie_mat, users_similarity_mat, user, movie\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 308,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" ... | \n",
" 601 | \n",
" 602 | \n",
" 603 | \n",
" 604 | \n",
" 605 | \n",
" 606 | \n",
" 607 | \n",
" 608 | \n",
" 609 | \n",
" 610 | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4.00 | \n",
" -0.0 | \n",
" -0.0 | \n",
" 0.78 | \n",
" 4.00 | \n",
" 0.05 | \n",
" 4.50 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" 4.00 | \n",
" 0.03 | \n",
" 4.00 | \n",
" 3.0 | \n",
" 4.00 | \n",
" 2.50 | \n",
" 4.00 | \n",
" 2.50 | \n",
" 3.00 | \n",
" 5.00 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.03 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" 1.04 | \n",
" 4.00 | \n",
" 0.04 | \n",
" 4.00 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" -0.03 | \n",
" 4.00 | \n",
" 0.04 | \n",
" 5.0 | \n",
" 3.50 | \n",
" 0.65 | \n",
" 0.87 | \n",
" 2.00 | \n",
" -0.01 | \n",
" 1.23 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.00 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" 1.04 | \n",
" 5.00 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.04 | \n",
" 0.0 | \n",
" 0.05 | \n",
" -0.17 | \n",
" -0.00 | \n",
" 2.00 | \n",
" -0.01 | \n",
" -0.02 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.03 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" 1.04 | \n",
" 5.00 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.04 | \n",
" 3.0 | \n",
" 0.05 | \n",
" 0.24 | \n",
" -0.00 | \n",
" 0.19 | \n",
" -0.01 | \n",
" -0.02 | \n",
"
\n",
" \n",
" 6 | \n",
" 4.00 | \n",
" -0.0 | \n",
" -0.0 | \n",
" 0.78 | \n",
" 1.04 | \n",
" 4.00 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" -0.03 | \n",
" 3.00 | \n",
" 4.00 | \n",
" 3.0 | \n",
" 0.05 | \n",
" 0.65 | \n",
" -0.00 | \n",
" 1.39 | \n",
" -0.01 | \n",
" 5.00 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 139385 | \n",
" 0.03 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" -0.02 | \n",
" 0.05 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" 1.79 | \n",
" 0.03 | \n",
" 0.04 | \n",
" 0.0 | \n",
" 0.05 | \n",
" -0.17 | \n",
" -0.00 | \n",
" 0.19 | \n",
" -0.01 | \n",
" 4.50 | \n",
"
\n",
" \n",
" 139644 | \n",
" 0.03 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" -0.02 | \n",
" 0.05 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.04 | \n",
" 0.0 | \n",
" 0.05 | \n",
" -0.17 | \n",
" -0.00 | \n",
" 0.19 | \n",
" -0.01 | \n",
" 4.50 | \n",
"
\n",
" \n",
" 140110 | \n",
" 0.03 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" -0.02 | \n",
" 0.05 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 5.00 | \n",
" ... | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.04 | \n",
" 0.0 | \n",
" 0.05 | \n",
" -0.17 | \n",
" -0.00 | \n",
" 0.19 | \n",
" -0.01 | \n",
" -0.02 | \n",
"
\n",
" \n",
" 142488 | \n",
" 0.03 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" -0.02 | \n",
" 0.05 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" 1.79 | \n",
" 0.03 | \n",
" 0.04 | \n",
" 0.0 | \n",
" 0.05 | \n",
" -0.17 | \n",
" -0.00 | \n",
" 0.19 | \n",
" -0.01 | \n",
" 3.50 | \n",
"
\n",
" \n",
" 148626 | \n",
" 0.03 | \n",
" -0.0 | \n",
" -0.0 | \n",
" -0.13 | \n",
" -0.02 | \n",
" 0.05 | \n",
" 0.04 | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" ... | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.04 | \n",
" 0.0 | \n",
" 0.05 | \n",
" -0.17 | \n",
" -0.00 | \n",
" 0.19 | \n",
" -0.01 | \n",
" 4.00 | \n",
"
\n",
" \n",
"
\n",
"
2026 rows × 610 columns
\n",
"
"
],
"text/plain": [
" 1 2 3 4 5 6 7 8 9 10 ... 601 \\\n",
"movieId ... \n",
"1 4.00 -0.0 -0.0 0.78 4.00 0.05 4.50 0.01 0.01 0.02 ... 4.00 \n",
"2 0.03 -0.0 -0.0 -0.13 1.04 4.00 0.04 4.00 0.01 0.02 ... -0.03 \n",
"3 4.00 -0.0 -0.0 -0.13 1.04 5.00 0.04 0.01 0.01 0.02 ... -0.03 \n",
"5 0.03 -0.0 -0.0 -0.13 1.04 5.00 0.04 0.01 0.01 0.02 ... -0.03 \n",
"6 4.00 -0.0 -0.0 0.78 1.04 4.00 0.04 0.01 0.01 0.02 ... -0.03 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"139385 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... 1.79 \n",
"139644 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... -0.03 \n",
"140110 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 5.00 ... -0.03 \n",
"142488 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... 1.79 \n",
"148626 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... -0.03 \n",
"\n",
" 602 603 604 605 606 607 608 609 610 \n",
"movieId \n",
"1 0.03 4.00 3.0 4.00 2.50 4.00 2.50 3.00 5.00 \n",
"2 4.00 0.04 5.0 3.50 0.65 0.87 2.00 -0.01 1.23 \n",
"3 0.03 0.04 0.0 0.05 -0.17 -0.00 2.00 -0.01 -0.02 \n",
"5 0.03 0.04 3.0 0.05 0.24 -0.00 0.19 -0.01 -0.02 \n",
"6 3.00 4.00 3.0 0.05 0.65 -0.00 1.39 -0.01 5.00 \n",
"... ... ... ... ... ... ... ... ... ... \n",
"139385 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 4.50 \n",
"139644 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 4.50 \n",
"140110 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 -0.02 \n",
"142488 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 3.50 \n",
"148626 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 4.00 \n",
"\n",
"[2026 rows x 610 columns]"
]
},
"execution_count": 308,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"full_ratings"
]
},
{
"cell_type": "code",
"execution_count": 309,
"metadata": {},
"outputs": [],
"source": [
"user_movie_mat.to_csv('../data/preprocessed/user_movie_mat.csv')\n",
"users_similarity_mat.to_pickle('../data/preprocessed/users_similarity_mat.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"em tese podemos usar a similaridade por cosseno para prever a nota de um usuário no filtor por conteudo tbm. \n",
"\n",
"assim teremos as previsões de avaliações em 2 sistemas (baseado em conteúdo e colaborativo), com uma média poderada obtemos um previsão final, usando ela podemos obter uma lista final."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}