{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cast</th>\n",
       "      <th>director</th>\n",
       "      <th>keywords</th>\n",
       "      <th>overview</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>movieId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>117529</th>\n",
       "      <td>['Chris Pratt', 'Bryce Dallas Howard', 'Irrfan...</td>\n",
       "      <td>Colin Trevorrow</td>\n",
       "      <td>['monster', 'dna', 'tyrannosaurus rex', 'veloc...</td>\n",
       "      <td>Twenty-two years after the events of Jurassic ...</td>\n",
       "      <td>Jurassic World</td>\n",
       "      <td>['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th...</td>\n",
       "      <td>2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      cast         director  \\\n",
       "movieId                                                                       \n",
       "117529   ['Chris Pratt', 'Bryce Dallas Howard', 'Irrfan...  Colin Trevorrow   \n",
       "\n",
       "                                                  keywords  \\\n",
       "movieId                                                      \n",
       "117529   ['monster', 'dna', 'tyrannosaurus rex', 'veloc...   \n",
       "\n",
       "                                                  overview            title  \\\n",
       "movieId                                                                       \n",
       "117529   Twenty-two years after the events of Jurassic ...  Jurassic World    \n",
       "\n",
       "                                                    genres  year  \n",
       "movieId                                                           \n",
       "117529   ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th...  2015  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId').iloc[:,2:]\n",
    "df.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = ['cast', 'director', 'keywords', 'overview', 'title', 'genres', 'year']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_data(x):\n",
    "    if isinstance(x, list):\n",
    "        return [str.lower(i for i in x)]\n",
    "    else:\n",
    "        if isinstance(x, str):\n",
    "            return str.lower(x)\n",
    "        else:\n",
    "            return ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_soup(x):\n",
    "    return (''.join(x['cast']) + ' ' + x['director'] + ' ' + ''.join(x['keywords']) + ' ' + x['overview'] +\n",
    "               ' ' + x['title'] + ' ' + ''.join(x['genres']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "for column in columns:\n",
    "    df[column] = df[column].apply(clean_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['soup'] = df.apply(get_soup, axis=1)\n",
    "df.reset_index(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "count = CountVectorizer(stop_words='english')\n",
    "\n",
    "count_matrix = count.fit_transform(df['soup'])\n",
    "\n",
    "cosim = cosine_similarity(count_matrix, count_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_recommendations(df, title, cosim):\n",
    "    index = df[df['title'] == title+' '].index.values[0]\n",
    "\n",
    "    sim_scores = list(enumerate(cosim[index]))\n",
    "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
    "    sim_scores = sim_scores[1:11]\n",
    "\n",
    "    movie_indexes = [i[0] for i in sim_scores]\n",
    "\n",
    "    return df['title'].iloc[movie_indexes]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "title = str.lower('Toy Story 2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1146                                           toy story \n",
       "117                                          toy story 3 \n",
       "1576                                      small soldiers \n",
       "168                                 being john malkovich \n",
       "1283                                                toys \n",
       "1952                                         stand by me \n",
       "1002    everything you always wanted to know about sex...\n",
       "239                                       monsters, inc. \n",
       "1548                                                antz \n",
       "1691                                                 big \n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations(df, title, cosim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['soup'].to_csv('soup.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}