{ "cells": [ { "cell_type": "code", "execution_count": 277, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from fuzzywuzzy import process" ] }, { "cell_type": "code", "execution_count": 278, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
0114.0964982703
1134.0964981247
2164.0964982224
31475.0964983815
41505.0964982931
...............
811116101590933.01493847704
811126101641795.01493845631
811136101665284.01493879365
811146101682505.01494273047
811156101682525.01493846352
\n", "

81116 rows × 4 columns

\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1 1 4.0 964982703\n", "1 1 3 4.0 964981247\n", "2 1 6 4.0 964982224\n", "3 1 47 5.0 964983815\n", "4 1 50 5.0 964982931\n", "... ... ... ... ...\n", "81111 610 159093 3.0 1493847704\n", "81112 610 164179 5.0 1493845631\n", "81113 610 166528 4.0 1493879365\n", "81114 610 168250 5.0 1494273047\n", "81115 610 168252 5.0 1493846352\n", "\n", "[81116 rows x 4 columns]" ] }, "execution_count": 278, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings = pd.read_csv('../data/reduced/ratings_m10.csv')\n", "ratings.reindex()" ] }, { "cell_type": "code", "execution_count": 279, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0tmdbIdimdbIdcastdirectorkeywordsoverviewtitlegenresyear
movieId
2902909100115963['Fairuza Balk', 'Neve Campbell', 'Robin Tunne...Andrew Fleming['witch', 'suicide attempt', 'becoming an adul...A Catholic school newcomer falls in with a cli...Craft, The['Drama', 'Fantasy', 'Horror', 'Thriller']1996
\n", "
" ], "text/plain": [ " Unnamed: 0 tmdbId imdbId \\\n", "movieId \n", "290 290 9100 115963 \n", "\n", " cast director \\\n", "movieId \n", "290 ['Fairuza Balk', 'Neve Campbell', 'Robin Tunne... Andrew Fleming \n", "\n", " keywords \\\n", "movieId \n", "290 ['witch', 'suicide attempt', 'becoming an adul... \n", "\n", " overview title \\\n", "movieId \n", "290 A Catholic school newcomer falls in with a cli... Craft, The \n", "\n", " genres year \n", "movieId \n", "290 ['Drama', 'Fantasy', 'Horror', 'Thriller'] 1996 " ] }, "execution_count": 279, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')\n", "movies.sample()" ] }, { "cell_type": "code", "execution_count": 280, "metadata": {}, "outputs": [], "source": [ "movies_title = movies[['title']]" ] }, { "cell_type": "code", "execution_count": 281, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamptitle
0114.0964982703Toy Story
1514.0847434962Toy Story
2714.51106635946Toy Story
31512.51510577970Toy Story
41714.51305696483Toy Story
..................
3585154613273.0973588711Scooby-Doo
3585255513273.0978748648Scooby-Doo
3585357113275.0966900601Scooby-Doo
3585460013272.01237710102Scooby-Doo
3585560713275.0963079647Scooby-Doo
\n", "

35856 rows × 5 columns

\n", "
" ], "text/plain": [ " userId movieId rating timestamp title\n", "0 1 1 4.0 964982703 Toy Story \n", "1 5 1 4.0 847434962 Toy Story \n", "2 7 1 4.5 1106635946 Toy Story \n", "3 15 1 2.5 1510577970 Toy Story \n", "4 17 1 4.5 1305696483 Toy Story \n", "... ... ... ... ... ...\n", "35851 546 1327 3.0 973588711 Scooby-Doo \n", "35852 555 1327 3.0 978748648 Scooby-Doo \n", "35853 571 1327 5.0 966900601 Scooby-Doo \n", "35854 600 1327 2.0 1237710102 Scooby-Doo \n", "35855 607 1327 5.0 963079647 Scooby-Doo \n", "\n", "[35856 rows x 5 columns]" ] }, "execution_count": 281, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rating_movie = ratings.merge(movies_title, on='movieId')\n", "rating_movie" ] }, { "cell_type": "code", "execution_count": 282, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamptitle
0114.0964982703Toy Story
1514.0847434962Toy Story
2714.51106635946Toy Story
31512.51510577970Toy Story
41714.51305696483Toy Story
..................
3585154613273.0973588711Scooby-Doo
3585255513273.0978748648Scooby-Doo
3585357113275.0966900601Scooby-Doo
3585460013272.01237710102Scooby-Doo
3585560713275.0963079647Scooby-Doo
\n", "

35856 rows × 5 columns

\n", "
" ], "text/plain": [ " userId movieId rating timestamp title\n", "0 1 1 4.0 964982703 Toy Story \n", "1 5 1 4.0 847434962 Toy Story \n", "2 7 1 4.5 1106635946 Toy Story \n", "3 15 1 2.5 1510577970 Toy Story \n", "4 17 1 4.5 1305696483 Toy Story \n", "... ... ... ... ... ...\n", "35851 546 1327 3.0 973588711 Scooby-Doo \n", "35852 555 1327 3.0 978748648 Scooby-Doo \n", "35853 571 1327 5.0 966900601 Scooby-Doo \n", "35854 600 1327 2.0 1237710102 Scooby-Doo \n", "35855 607 1327 5.0 963079647 Scooby-Doo \n", "\n", "[35856 rows x 5 columns]" ] }, "execution_count": 282, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rating_movie" ] }, { "cell_type": "code", "execution_count": 283, "metadata": {}, "outputs": [], "source": [ "def train_test_column_split(df: pd.DataFrame, group_column: str, split_column: str, y_label: str, train_size: float):\n", " df = df.sort_values(by=split_column, ascending=True) \n", " train = pd.DataFrame(columns=df.columns)\n", " test = pd.DataFrame(columns=df.columns)\n", "\n", " for idx in df[group_column].unique():\n", " group = df.loc[df[group_column] == idx]\n", "\n", " q_user = group[group[split_column].le(group[split_column].quantile(train_size))]\n", " p_user = group[group[split_column].ge(group[split_column].quantile(train_size))]\n", "\n", " train = pd.concat([train, q_user])\n", " test = pd.concat([test, p_user])\n", " train = train.sort_index(ascending=True)\n", " test = test.sort_index(ascending=True)\n", "\n", " X_labels = [c for c in df.columns if c != y_label]\n", "\n", " X_train = train[X_labels]\n", " X_test = test[X_labels]\n", " y_train = train[y_label]\n", " y_test = test[y_label]\n", "\n", " return (X_train, X_test, y_train, y_test)" ] }, { "cell_type": "code", "execution_count": 284, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_column_split(rating_movie, 'userId', 'timestamp', 'rating', .8)" ] }, { "cell_type": "code", "execution_count": 285, "metadata": {}, "outputs": [], "source": [ "train = pd.concat([X_train, y_train], axis=1)\n", "test = pd.concat([X_test, y_test], axis=1)" ] }, { "cell_type": "code", "execution_count": 286, "metadata": {}, "outputs": [], "source": [ "user_movie_mat = rating_movie.pivot(index='movieId', columns='userId', values='rating').fillna(0)\n", "user_movie_mat_train = train.pivot(index='movieId', columns='userId', values='rating').fillna(0)\n", "user_movie_mat_test = test.pivot(index='movieId', columns='userId', values='rating').fillna(0)" ] }, { "cell_type": "code", "execution_count": 287, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userId12345678910...601602603604605606607608609610
movieId
14.00.00.00.04.00.04.50.00.00.0...4.00.04.03.04.00.00.02.50.05.0
20.00.00.00.00.04.00.00.00.00.0...0.04.00.05.00.00.00.02.00.00.0
34.00.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.02.00.00.0
50.00.00.00.00.05.00.00.00.00.0...0.00.00.03.00.00.00.00.00.00.0
64.00.00.00.00.04.00.00.00.00.0...0.03.04.03.00.00.00.00.00.00.0
..................................................................
20185.00.00.50.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20190.00.00.02.00.00.00.00.00.00.0...4.50.05.00.00.00.00.00.00.00.0
20200.00.00.00.00.00.00.00.00.00.0...0.00.04.00.00.04.00.00.00.00.0
20210.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.03.50.00.0
20230.00.00.00.00.00.00.00.03.00.0...0.00.02.00.00.00.00.00.00.00.0
\n", "

766 rows × 608 columns

\n", "
" ], "text/plain": [ "userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n", "movieId ... \n", "1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n", "2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 4.0 0.0 \n", "3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2019 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n", "2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n", "2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n", "\n", "userId 604 605 606 607 608 609 610 \n", "movieId \n", "1 3.0 4.0 0.0 0.0 2.5 0.0 5.0 \n", "2 5.0 0.0 0.0 0.0 2.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n", "5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "6 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... \n", "2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2019 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2021 0.0 0.0 0.0 0.0 3.5 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[766 rows x 608 columns]" ] }, "execution_count": 287, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_movie_mat_train" ] }, { "cell_type": "code", "execution_count": 288, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userId12345678910...601602603604605606607608609610
movieId
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.02.54.00.03.00.0
20.00.00.00.00.00.00.04.00.00.0...0.00.00.00.03.50.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
50.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
60.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.05.0
..................................................................
20180.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20190.00.00.00.00.00.05.00.00.00.0...0.00.00.00.00.04.00.00.00.00.0
20200.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20210.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.04.00.00.00.00.0
20230.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.04.50.00.0
\n", "

763 rows × 608 columns

\n", "
" ], "text/plain": [ "userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n", "movieId ... \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2019 0.0 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "\n", "userId 604 605 606 607 608 609 610 \n", "movieId \n", "1 0.0 0.0 2.5 4.0 0.0 3.0 0.0 \n", "2 0.0 3.5 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "6 0.0 0.0 0.0 0.0 0.0 0.0 5.0 \n", "... ... ... ... ... ... ... ... \n", "2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2021 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n", "\n", "[763 rows x 608 columns]" ] }, "execution_count": 288, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_movie_mat_test" ] }, { "cell_type": "code", "execution_count": 289, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userId12345678910...601602603604605606607608609610
movieId
14.00.00.00.04.00.04.50.00.00.0...4.00.04.03.04.02.54.02.53.05.0
20.00.00.00.00.04.00.04.00.00.0...0.04.00.05.03.50.00.02.00.00.0
34.00.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.02.00.00.0
50.00.00.00.00.05.00.00.00.00.0...0.00.00.03.00.00.00.00.00.00.0
64.00.00.00.00.04.00.00.00.00.0...0.03.04.03.00.00.00.00.00.05.0
..................................................................
20185.00.00.50.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20190.00.00.02.00.00.05.00.00.00.0...4.50.05.00.00.04.00.00.00.00.0
20200.00.00.00.00.00.00.00.00.00.0...0.00.04.00.00.04.00.00.00.00.0
20210.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.04.00.03.50.00.0
20230.00.00.00.00.00.00.00.03.00.0...0.00.02.00.00.00.00.04.50.00.0
\n", "

766 rows × 608 columns

\n", "
" ], "text/plain": [ "userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n", "movieId ... \n", "1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n", "2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 4.0 0.0 0.0 ... 0.0 4.0 0.0 \n", "3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2019 0.0 0.0 0.0 2.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n", "2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n", "2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n", "\n", "userId 604 605 606 607 608 609 610 \n", "movieId \n", "1 3.0 4.0 2.5 4.0 2.5 3.0 5.0 \n", "2 5.0 3.5 0.0 0.0 2.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n", "5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "6 3.0 0.0 0.0 0.0 0.0 0.0 5.0 \n", "... ... ... ... ... ... ... ... \n", "2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2021 0.0 0.0 4.0 0.0 3.5 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n", "\n", "[766 rows x 608 columns]" ] }, "execution_count": 289, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_movie_mat" ] }, { "cell_type": "code", "execution_count": 290, "metadata": {}, "outputs": [], "source": [ "def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):\n", " \"\"\"Find correlation between two users based on their rated movies using Pearson correlation\"\"\"\n", " rated_movies_by_both = ratings_df[[user1, user2]].dropna(axis=0).values\n", " user1_ratings = rated_movies_by_both[:, 0]\n", " user2_ratings = rated_movies_by_both[:, 1]\n", " return np.corrcoef(user1_ratings, user2_ratings)[0, 1]" ] }, { "cell_type": "code", "execution_count": 291, "metadata": {}, "outputs": [], "source": [ "users_list = list(user_movie_mat.columns)\n", "movies_list = list(user_movie_mat.index)\n", "\n", "#users_similarity_mat = np.array([[corr_between_users(user_movie_mat, user1, user2) for user1 in users_list] for user2 in users_list])\n", "#users_similarity_mat = pd.DataFrame(users_similarity_mat, index=users_list, columns=users_list)\n", "users_similarity_mat = pd.read_pickle('../data/preprocessed/users_similarity_mat.pkl')" ] }, { "cell_type": "code", "execution_count": 292, "metadata": {}, "outputs": [], "source": [ "def get_rated_user_for_a_movie(ratings_df: pd.DataFrame, movie: str):\n", " return ratings_df.loc[movie, :].dropna().index.values\n", "\n", "\n", "def get_top_neighbors(\n", " similarity_df: pd.DataFrame, user: str, rated_users: str, n_neighbors: int\n", "):\n", " return similarity_df[user][rated_users].nlargest(n_neighbors).to_dict()\n", "\n", "\n", "def subtract_bias(rating: float, mean_rating: float):\n", " return rating - mean_rating\n", "\n", "\n", "def get_neighbor_rating_without_bias_per_movie(\n", " ratings_df: pd.DataFrame, user: str, movie: str\n", "):\n", " \"\"\"Substract the rating of a user from the mean rating of that user to eliminate bias\"\"\"\n", " mean_rating = ratings_df[user].mean()\n", " rating = ratings_df.loc[movie, user]\n", " return subtract_bias(rating, mean_rating)\n", "\n", "\n", "def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, movie: str):\n", " \"\"\"Get the ratings of all neighbors after adjusting for biases\"\"\"\n", " return [\n", " get_neighbor_rating_without_bias_per_movie(ratings_df, neighbor, movie)\n", " for neighbor in neighbors\n", " ]\n", "\n", "def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):\n", " weighted_sum = np.array(ratings).dot(np.array(neighbor_distance))\n", " abs_neigbor_distance = np.abs(neighbor_distance)\n", " return weighted_sum / np.sum(abs_neigbor_distance)\n", "\n", "\n", "def ger_user_rating(ratings_df: pd.DataFrame, user: str, avg_neighbor_rating: float):\n", " user_avg_rating = ratings_df[user].mean()\n", " return round(user_avg_rating + avg_neighbor_rating, 2)\n" ] }, { "cell_type": "code", "execution_count": 293, "metadata": {}, "outputs": [], "source": [ "def predict_rating(\n", " df: pd.DataFrame,\n", " similarity_df: pd.DataFrame,\n", " user: str,\n", " movie: str,\n", " n_neighbors: int = 2,\n", "):\n", " \"\"\"Predict the rating of a user for a movie based on the ratings of neighbors\"\"\"\n", " ratings_df = df.copy()\n", "\n", " rated_users = get_rated_user_for_a_movie(ratings_df, movie)\n", "\n", " top_neighbors_distance = get_top_neighbors(\n", " similarity_df, user, rated_users, n_neighbors\n", " )\n", " neighbors, distance = top_neighbors_distance.keys(), top_neighbors_distance.values()\n", "\n", " #print(f\"Top {n_neighbors} neighbors of user {user}, {movie}: {list(neighbors)}, distance: {list(distance)}\")\n", "\n", " ratings = get_ratings_of_neighbors(ratings_df, neighbors, movie)\n", " avg_neighbor_rating = get_weighted_average_rating_of_neighbors(\n", " ratings, list(distance)\n", " )\n", "\n", " return ger_user_rating(ratings_df, user, avg_neighbor_rating)" ] }, { "cell_type": "code", "execution_count": 294, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.02" ] }, "execution_count": 294, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie_name = 'Heartbreakers'\n", "user1 = 1\n", "movie = process.extractOne(movie_name, movies['title'])[2]\n", "rating = predict_rating(user_movie_mat, users_similarity_mat, user1, movie, 10)\n", "rating" ] }, { "cell_type": "code", "execution_count": 295, "metadata": {}, "outputs": [], "source": [ "def get_n_recommendations(user: int, n: int, user_movie_mat: pd.DataFrame, movies: pd.DataFrame):\n", " full_ratings = user_movie_mat.copy()\n", " recommendations = pd.DataFrame(columns=['movieId', 'title', 'rating'])\n", "\n", " for movie, _ in full_ratings[user].items():\n", " if np.isnan(full_ratings.loc[movie, user]) or full_ratings.loc[movie, user] == 0:\n", " full_ratings.loc[movie, user] = predict_rating(user_movie_mat, users_similarity_mat, user, movie, 10)\n", " new_row = {'movieId': movie, 'title': movies.loc[movie]['title'], 'rating': full_ratings.loc[movie, user]}\n", " recommendations.loc[len(recommendations)] = new_row\n", "\n", " recommendations = recommendations.sort_values(by='rating', ascending=False)\n", " return recommendations.head(n) if n > 0 else recommendations" ] }, { "cell_type": "code", "execution_count": 296, "metadata": {}, "outputs": [], "source": [ "full_ratings = pd.read_csv('../data/preprocessed/full_ratings_comp.csv', index_col='movieId')" ] }, { "cell_type": "code", "execution_count": 310, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlerating
4031200Heartbreakers3.02
303858South Park: Bigger, Longer and Uncut2.75
243589Fallen2.58
232541George of the Jungle2.51
4941374Final Destination 22.46
4151221Evolution2.42
326924Goldfinger2.40
5491610Hard Candy2.16
3571036Great Muppet Caper, The2.13
3841129Hollow Man2.09
\n", "
" ], "text/plain": [ " movieId title rating\n", "403 1200 Heartbreakers 3.02\n", "303 858 South Park: Bigger, Longer and Uncut 2.75\n", "243 589 Fallen 2.58\n", "232 541 George of the Jungle 2.51\n", "494 1374 Final Destination 2 2.46\n", "415 1221 Evolution 2.42\n", "326 924 Goldfinger 2.40\n", "549 1610 Hard Candy 2.16\n", "357 1036 Great Muppet Caper, The 2.13\n", "384 1129 Hollow Man 2.09" ] }, "execution_count": 310, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_id = 1\n", "n_recommendations = 10\n", "\n", "get_n_recommendations(user_id, n_recommendations, user_movie_mat, movies)" ] }, { "cell_type": "code", "execution_count": 298, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userId12345678910...601602603604605606607608609610
movieId
14.00.00.00.04.00.04.50.00.00.0...4.00.04.03.04.02.54.02.53.05.0
20.00.00.00.00.04.00.04.00.00.0...0.04.00.05.03.50.00.02.00.00.0
34.00.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.02.00.00.0
50.00.00.00.00.05.00.00.00.00.0...0.00.00.03.00.00.00.00.00.00.0
64.00.00.00.00.04.00.00.00.00.0...0.03.04.03.00.00.00.00.00.05.0
..................................................................
20185.00.00.50.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20190.00.00.02.00.00.05.00.00.00.0...4.50.05.00.00.04.00.00.00.00.0
20200.00.00.00.00.00.00.00.00.00.0...0.00.04.00.00.04.00.00.00.00.0
20210.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.04.00.03.50.00.0
20230.00.00.00.00.00.00.00.03.00.0...0.00.02.00.00.00.00.04.50.00.0
\n", "

766 rows × 608 columns

\n", "
" ], "text/plain": [ "userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n", "movieId ... \n", "1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n", "2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 4.0 0.0 0.0 ... 0.0 4.0 0.0 \n", "3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2019 0.0 0.0 0.0 2.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n", "2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n", "2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n", "\n", "userId 604 605 606 607 608 609 610 \n", "movieId \n", "1 3.0 4.0 2.5 4.0 2.5 3.0 5.0 \n", "2 5.0 3.5 0.0 0.0 2.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n", "5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "6 3.0 0.0 0.0 0.0 0.0 0.0 5.0 \n", "... ... ... ... ... ... ... ... \n", "2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2021 0.0 0.0 4.0 0.0 3.5 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n", "\n", "[766 rows x 608 columns]" ] }, "execution_count": 298, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_movie_mat" ] }, { "cell_type": "code", "execution_count": 299, "metadata": {}, "outputs": [], "source": [ "def store_ratings(user: int, n: int, user_movie_mat: pd.DataFrame, movies: pd.DataFrame):\n", " full_ratings = user_movie_mat.copy()\n", "\n", " for movie, _ in user_movie_mat[user].items():\n", " if np.isnan(user_movie_mat.loc[movie, user]) or user_movie_mat.loc[movie, user] == 0:\n", " user_movie_mat.loc[movie, user] = predict_rating(user_movie_mat, users_similarity_mat, user, movie, 100)" ] }, { "cell_type": "code", "execution_count": 300, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlerating
4381240Dirty Rotten Scoundrels2.98
4131200Heartbreakers2.81
311858South Park: Bigger, Longer and Uncut2.78
237541George of the Jungle2.53
4261221Evolution2.44
............
3891103Road Warrior, The (Mad Max 2)-0.12
96207Free Willy-0.12
3861096Flatliners-0.12
3851095Blood Simple-0.12
90194Black Beauty-0.12
\n", "

676 rows × 3 columns

\n", "
" ], "text/plain": [ " movieId title rating\n", "438 1240 Dirty Rotten Scoundrels 2.98\n", "413 1200 Heartbreakers 2.81\n", "311 858 South Park: Bigger, Longer and Uncut 2.78\n", "237 541 George of the Jungle 2.53\n", "426 1221 Evolution 2.44\n", ".. ... ... ...\n", "389 1103 Road Warrior, The (Mad Max 2) -0.12\n", "96 207 Free Willy -0.12\n", "386 1096 Flatliners -0.12\n", "385 1095 Blood Simple -0.12\n", "90 194 Black Beauty -0.12\n", "\n", "[676 rows x 3 columns]" ] }, "execution_count": 300, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_n_recommendations(user_id, n_recommendations, user_movie_mat_train, movies)" ] }, { "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [], "source": [ "from math import sqrt" ] }, { "cell_type": "code", "execution_count": 302, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userId12345678910...601602603604605606607608609610
movieId
14.00.00.00.04.00.04.50.00.00.0...4.00.04.03.04.02.54.02.53.05.0
20.00.00.00.00.04.00.04.00.00.0...0.04.00.05.03.50.00.02.00.00.0
34.00.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.02.00.00.0
50.00.00.00.00.05.00.00.00.00.0...0.00.00.03.00.00.00.00.00.00.0
64.00.00.00.00.04.00.00.00.00.0...0.03.04.03.00.00.00.00.00.05.0
..................................................................
20185.00.00.50.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20190.00.00.02.00.00.05.00.00.00.0...4.50.05.00.00.04.00.00.00.00.0
20200.00.00.00.00.00.00.00.00.00.0...0.00.04.00.00.04.00.00.00.00.0
20210.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.04.00.03.50.00.0
20230.00.00.00.00.00.00.00.03.00.0...0.00.02.00.00.00.00.04.50.00.0
\n", "

766 rows × 608 columns

\n", "
" ], "text/plain": [ "userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 603 \\\n", "movieId ... \n", "1 4.0 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 4.0 \n", "2 0.0 0.0 0.0 0.0 0.0 4.0 0.0 4.0 0.0 0.0 ... 0.0 4.0 0.0 \n", "3 4.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "6 4.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 4.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "2018 5.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2019 0.0 0.0 0.0 2.0 0.0 0.0 5.0 0.0 0.0 0.0 ... 4.5 0.0 5.0 \n", "2020 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 4.0 \n", "2021 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 2.0 \n", "\n", "userId 604 605 606 607 608 609 610 \n", "movieId \n", "1 3.0 4.0 2.5 4.0 2.5 3.0 5.0 \n", "2 5.0 3.5 0.0 0.0 2.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n", "5 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "6 3.0 0.0 0.0 0.0 0.0 0.0 5.0 \n", "... ... ... ... ... ... ... ... \n", "2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2019 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2020 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2021 0.0 0.0 4.0 0.0 3.5 0.0 0.0 \n", "2023 0.0 0.0 0.0 0.0 4.5 0.0 0.0 \n", "\n", "[766 rows x 608 columns]" ] }, "execution_count": 302, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_movie_mat" ] }, { "cell_type": "code", "execution_count": 303, "metadata": {}, "outputs": [], "source": [ "user = 1\n", "SSE = 0\n", "c = 0" ] }, { "cell_type": "code", "execution_count": 304, "metadata": {}, "outputs": [], "source": [ "for movie, _ in user_movie_mat_train[user].items():\n", " if np.isnan(user_movie_mat_train.loc[movie, user]) or user_movie_mat_train.loc[movie, user] == 0:\n", " user_movie_mat_train.loc[movie, user] = predict_rating(user_movie_mat_train, users_similarity_mat, user, movie, 100)" ] }, { "cell_type": "code", "execution_count": 305, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userId12345678910...601602603604605606607608609610
movieId
14.000.00.00.04.00.04.50.00.00.0...4.00.04.03.04.00.00.02.50.05.0
20.900.00.00.00.04.00.00.00.00.0...0.04.00.05.00.00.00.02.00.00.0
34.000.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.02.00.00.0
50.120.00.00.00.05.00.00.00.00.0...0.00.00.03.00.00.00.00.00.00.0
64.000.00.00.00.04.00.00.00.00.0...0.03.04.03.00.00.00.00.00.00.0
..................................................................
20185.000.00.50.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20191.210.00.02.00.00.00.00.00.00.0...4.50.05.00.00.00.00.00.00.00.0
20200.760.00.00.00.00.00.00.00.00.0...0.00.04.00.00.04.00.00.00.00.0
20211.080.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.03.50.00.0
20231.140.00.00.00.00.00.00.03.00.0...0.00.02.00.00.00.00.00.00.00.0
\n", "

766 rows × 608 columns

\n", "
" ], "text/plain": [ "userId 1 2 3 4 5 6 7 8 9 10 ... 601 602 \\\n", "movieId ... \n", "1 4.00 0.0 0.0 0.0 4.0 0.0 4.5 0.0 0.0 0.0 ... 4.0 0.0 \n", "2 0.90 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 4.0 \n", "3 4.00 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "5 0.12 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "6 4.00 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "2018 5.00 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "2019 1.21 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.5 0.0 \n", "2020 0.76 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "2021 1.08 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "2023 1.14 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 ... 0.0 0.0 \n", "\n", "userId 603 604 605 606 607 608 609 610 \n", "movieId \n", "1 4.0 3.0 4.0 0.0 0.0 2.5 0.0 5.0 \n", "2 0.0 5.0 0.0 0.0 0.0 2.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 \n", "5 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "6 4.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... \n", "2018 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2019 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2020 4.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 \n", "2021 0.0 0.0 0.0 0.0 0.0 3.5 0.0 0.0 \n", "2023 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[766 rows x 608 columns]" ] }, "execution_count": 305, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_movie_mat_train" ] }, { "cell_type": "code", "execution_count": 306, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.1835473787967214\n" ] } ], "source": [ "\n", "for movie, _ in user_movie_mat_test[user].items():\n", " if user_movie_mat_test.loc[movie, user] != 0:\n", " #print(user_movie_mat_test.loc[movie, user], user_movie_mat_train.loc[movie, user])\n", " E = user_movie_mat_test.loc[movie, user] - user_movie_mat_train.loc[movie, user]\n", " SSE = SSE + pow(E, 2)\n", " c = c+1\n", "MSE = SSE/c\n", "RMSE = sqrt(MSE)\n", "print(RMSE)" ] }, { "cell_type": "code", "execution_count": 307, "metadata": {}, "outputs": [], "source": [ "def ger_full_ratings():\n", " full_ratings = user_movie_mat.copy()\n", "\n", " for user, movies in full_ratings.items():\n", " for movie in movies.keys():\n", " if np.isnan(full_ratings.loc[movie, user]) or full_ratings.loc[movie, user] == 0:\n", " full_ratings.loc[movie, user] = predict_rating(\n", " user_movie_mat, users_similarity_mat, user, movie\n", " )" ] }, { "cell_type": "code", "execution_count": 308, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
12345678910...601602603604605606607608609610
movieId
14.00-0.0-0.00.784.000.054.500.010.010.02...4.000.034.003.04.002.504.002.503.005.00
20.03-0.0-0.0-0.131.044.000.044.000.010.02...-0.034.000.045.03.500.650.872.00-0.011.23
34.00-0.0-0.0-0.131.045.000.040.010.010.02...-0.030.030.040.00.05-0.17-0.002.00-0.01-0.02
50.03-0.0-0.0-0.131.045.000.040.010.010.02...-0.030.030.043.00.050.24-0.000.19-0.01-0.02
64.00-0.0-0.00.781.044.000.040.010.010.02...-0.033.004.003.00.050.65-0.001.39-0.015.00
..................................................................
1393850.03-0.0-0.0-0.13-0.020.050.040.010.010.02...1.790.030.040.00.05-0.17-0.000.19-0.014.50
1396440.03-0.0-0.0-0.13-0.020.050.040.010.010.02...-0.030.030.040.00.05-0.17-0.000.19-0.014.50
1401100.03-0.0-0.0-0.13-0.020.050.040.010.015.00...-0.030.030.040.00.05-0.17-0.000.19-0.01-0.02
1424880.03-0.0-0.0-0.13-0.020.050.040.010.010.02...1.790.030.040.00.05-0.17-0.000.19-0.013.50
1486260.03-0.0-0.0-0.13-0.020.050.040.010.010.02...-0.030.030.040.00.05-0.17-0.000.19-0.014.00
\n", "

2026 rows × 610 columns

\n", "
" ], "text/plain": [ " 1 2 3 4 5 6 7 8 9 10 ... 601 \\\n", "movieId ... \n", "1 4.00 -0.0 -0.0 0.78 4.00 0.05 4.50 0.01 0.01 0.02 ... 4.00 \n", "2 0.03 -0.0 -0.0 -0.13 1.04 4.00 0.04 4.00 0.01 0.02 ... -0.03 \n", "3 4.00 -0.0 -0.0 -0.13 1.04 5.00 0.04 0.01 0.01 0.02 ... -0.03 \n", "5 0.03 -0.0 -0.0 -0.13 1.04 5.00 0.04 0.01 0.01 0.02 ... -0.03 \n", "6 4.00 -0.0 -0.0 0.78 1.04 4.00 0.04 0.01 0.01 0.02 ... -0.03 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... \n", "139385 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... 1.79 \n", "139644 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... -0.03 \n", "140110 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 5.00 ... -0.03 \n", "142488 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... 1.79 \n", "148626 0.03 -0.0 -0.0 -0.13 -0.02 0.05 0.04 0.01 0.01 0.02 ... -0.03 \n", "\n", " 602 603 604 605 606 607 608 609 610 \n", "movieId \n", "1 0.03 4.00 3.0 4.00 2.50 4.00 2.50 3.00 5.00 \n", "2 4.00 0.04 5.0 3.50 0.65 0.87 2.00 -0.01 1.23 \n", "3 0.03 0.04 0.0 0.05 -0.17 -0.00 2.00 -0.01 -0.02 \n", "5 0.03 0.04 3.0 0.05 0.24 -0.00 0.19 -0.01 -0.02 \n", "6 3.00 4.00 3.0 0.05 0.65 -0.00 1.39 -0.01 5.00 \n", "... ... ... ... ... ... ... ... ... ... \n", "139385 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 4.50 \n", "139644 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 4.50 \n", "140110 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 -0.02 \n", "142488 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 3.50 \n", "148626 0.03 0.04 0.0 0.05 -0.17 -0.00 0.19 -0.01 4.00 \n", "\n", "[2026 rows x 610 columns]" ] }, "execution_count": 308, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full_ratings" ] }, { "cell_type": "code", "execution_count": 309, "metadata": {}, "outputs": [], "source": [ "user_movie_mat.to_csv('../data/preprocessed/user_movie_mat.csv')\n", "users_similarity_mat.to_pickle('../data/preprocessed/users_similarity_mat.pkl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "em tese podemos usar a similaridade por cosseno para prever a nota de um usuário no filtor por conteudo tbm. \n", "\n", "assim teremos as previsões de avaliações em 2 sistemas (baseado em conteúdo e colaborativo), com uma média poderada obtemos um previsão final, usando ela podemos obter uma lista final." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }