{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')\n", "ratings = pd.read_csv('../data/reduced/ratings_m10.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
0114.0964982703
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1 1 4.0 964982703" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.head(1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0tmdbIdimdbIdcastdirectorkeywordsoverviewtitlegenresyear
movieId
11862114709['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...John Lasseter['jealousy', 'toy', 'boy', 'friendship', 'frie...Woody the cowboy is young Andy’s favorite to...Toy Story['Adventure', 'Animation', 'Children', 'Comedy...1995
\n", "
" ], "text/plain": [ " Unnamed: 0 tmdbId imdbId \\\n", "movieId \n", "1 1 862 114709 \n", "\n", " cast director \\\n", "movieId \n", "1 ['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim... John Lasseter \n", "\n", " keywords \\\n", "movieId \n", "1 ['jealousy', 'toy', 'boy', 'friendship', 'frie... \n", "\n", " overview title \\\n", "movieId \n", "1 Woody the cowboy is young Andy’s favorite to... Toy Story \n", "\n", " genres year \n", "movieId \n", "1 ['Adventure', 'Animation', 'Children', 'Comedy... 1995 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head(1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#ratings = ratings.head(200)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def train_test_column_split(df, group_column, split_column, y_label, train_size):\n", " df = df.sort_values(by=split_column, ascending=True) \n", " train = pd.DataFrame(columns=df.columns)\n", " test = pd.DataFrame(columns=df.columns)\n", "\n", " for idx in df[group_column].unique():\n", " group = df.loc[df[group_column] == idx]\n", "\n", " q_user = group[group[split_column].le(group[split_column].quantile(train_size))]\n", " p_user = group[group[split_column].ge(group[split_column].quantile(train_size))]\n", "\n", " train = pd.concat([train, q_user])\n", " test = pd.concat([test, p_user])\n", " train = train.sort_index(ascending=True)\n", " test = test.sort_index(ascending=True)\n", "\n", " X_labels = [c for c in df.columns if c != y_label]\n", "\n", " X_train = train[X_labels]\n", " X_test = test[X_labels]\n", " y_train = train[y_label]\n", " y_test = test[y_label]\n", "\n", " return (X_train, X_test, y_train, y_test)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_column_split(ratings, 'userId', 'timestamp', 'rating', .9)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdtimestamp
011964982703
113964981247
216964982224
4150964982931
5170964982400
............
811096101572961493846563
811106101582381479545219
811116101590931493847704
811126101641791493845631
811156101682521493846352
\n", "

72991 rows × 3 columns

\n", "
" ], "text/plain": [ " userId movieId timestamp\n", "0 1 1 964982703\n", "1 1 3 964981247\n", "2 1 6 964982224\n", "4 1 50 964982931\n", "5 1 70 964982400\n", "... ... ... ...\n", "81109 610 157296 1493846563\n", "81110 610 158238 1479545219\n", "81111 610 159093 1493847704\n", "81112 610 164179 1493845631\n", "81115 610 168252 1493846352\n", "\n", "[72991 rows x 3 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 4.0\n", "1 4.0\n", "2 4.0\n", "4 5.0\n", "5 3.0\n", " ... \n", "81109 4.0\n", "81110 5.0\n", "81111 3.0\n", "81112 5.0\n", "81115 5.0\n", "Name: rating, Length: 72991, dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdtimestamp
3147964983815
81151964984041
91157964984100
281527964984002
311553964984153
............
810756101151491493849607
810766101152101493849803
811056101424881493849575
811136101665281493879365
811146101682501494273047
\n", "

8621 rows × 3 columns

\n", "
" ], "text/plain": [ " userId movieId timestamp\n", "3 1 47 964983815\n", "8 1 151 964984041\n", "9 1 157 964984100\n", "28 1 527 964984002\n", "31 1 553 964984153\n", "... ... ... ...\n", "81075 610 115149 1493849607\n", "81076 610 115210 1493849803\n", "81105 610 142488 1493849575\n", "81113 610 166528 1493879365\n", "81114 610 168250 1494273047\n", "\n", "[8621 rows x 3 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3 5.0\n", "8 5.0\n", "9 5.0\n", "28 5.0\n", "31 5.0\n", " ... \n", "81075 5.0\n", "81076 4.0\n", "81105 3.5\n", "81113 4.0\n", "81114 5.0\n", "Name: rating, Length: 8621, dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "X_train.to_csv('../data/splitted/X_train_90.csv')\n", "X_test.to_csv('../data/splitted/X_test_90.csv')\n", "np.save('../data/splitted/y_train_90.npy', y_train)\n", "np.save('../data/splitted/y_test_90.npy', y_test)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }