{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castdirectorkeywordsoverviewtitlegenresyear
movieId
117529['Chris Pratt', 'Bryce Dallas Howard', 'Irrfan...Colin Trevorrow['monster', 'dna', 'tyrannosaurus rex', 'veloc...Twenty-two years after the events of Jurassic ...Jurassic World['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th...2015
\n", "
" ], "text/plain": [ " cast director \\\n", "movieId \n", "117529 ['Chris Pratt', 'Bryce Dallas Howard', 'Irrfan... Colin Trevorrow \n", "\n", " keywords \\\n", "movieId \n", "117529 ['monster', 'dna', 'tyrannosaurus rex', 'veloc... \n", "\n", " overview title \\\n", "movieId \n", "117529 Twenty-two years after the events of Jurassic ... Jurassic World \n", "\n", " genres year \n", "movieId \n", "117529 ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th... 2015 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId').iloc[:,2:]\n", "df.head(1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "columns = ['cast', 'director', 'keywords', 'overview', 'title', 'genres', 'year']" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def clean_data(x):\n", " if isinstance(x, list):\n", " return [str.lower(i for i in x)]\n", " else:\n", " if isinstance(x, str):\n", " return str.lower(x)\n", " else:\n", " return ''" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def get_soup(x):\n", " return (''.join(x['cast']) + ' ' + x['director'] + ' ' + ''.join(x['keywords']) + ' ' + x['overview'] +\n", " ' ' + x['title'] + ' ' + ''.join(x['genres']))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "for column in columns:\n", " df[column] = df[column].apply(clean_data)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df['soup'] = df.apply(get_soup, axis=1)\n", "df.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "count = CountVectorizer(stop_words='english')\n", "\n", "count_matrix = count.fit_transform(df['soup'])\n", "\n", "cosim = cosine_similarity(count_matrix, count_matrix)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def get_recommendations(df, title, cosim):\n", " index = df[df['title'] == title+' '].index.values[0]\n", "\n", " sim_scores = list(enumerate(cosim[index]))\n", " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n", " sim_scores = sim_scores[1:11]\n", "\n", " movie_indexes = [i[0] for i in sim_scores]\n", "\n", " return df['title'].iloc[movie_indexes]\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "title = str.lower('Toy Story 2')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1146 toy story \n", "117 toy story 3 \n", "1576 small soldiers \n", "168 being john malkovich \n", "1283 toys \n", "1952 stand by me \n", "1002 everything you always wanted to know about sex...\n", "239 monsters, inc. \n", "1548 antz \n", "1691 big \n", "Name: title, dtype: object" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_recommendations(df, title, cosim)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "df['soup'].to_csv('soup.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }