{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.pipeline import make_pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Você pode baixar o conjunto de dados MovieLens 100k em https://grouplens.org/datasets/movielens/\n", "# Carregue os arquivos 'movies.csv' e 'ratings.csv'\n", "\n", "movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')\n", "ratings = pd.read_csv('../data/reduced/ratings_m10.csv')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "movies = movies[['title', 'genres']]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenres
movieId
1Toy Story['Adventure', 'Animation', 'Children', 'Comedy...
2Jumanji['Adventure', 'Children', 'Fantasy']
3Grumpier Old Men['Comedy', 'Romance']
4Father of the Bride Part II['Comedy']
5Heat['Action', 'Crime', 'Thriller']
.........
2022The Revenant['Adventure', 'Drama']
2023Sicario['Crime', 'Drama', 'Mystery']
2024The Intern['Comedy']
2025Spotlight['Thriller']
2026Big Short, The['Drama']
\n", "

2026 rows × 2 columns

\n", "
" ], "text/plain": [ " title \\\n", "movieId \n", "1 Toy Story \n", "2 Jumanji \n", "3 Grumpier Old Men \n", "4 Father of the Bride Part II \n", "5 Heat \n", "... ... \n", "2022 The Revenant \n", "2023 Sicario \n", "2024 The Intern \n", "2025 Spotlight \n", "2026 Big Short, The \n", "\n", " genres \n", "movieId \n", "1 ['Adventure', 'Animation', 'Children', 'Comedy... \n", "2 ['Adventure', 'Children', 'Fantasy'] \n", "3 ['Comedy', 'Romance'] \n", "4 ['Comedy'] \n", "5 ['Action', 'Crime', 'Thriller'] \n", "... ... \n", "2022 ['Adventure', 'Drama'] \n", "2023 ['Crime', 'Drama', 'Mystery'] \n", "2024 ['Comedy'] \n", "2025 ['Thriller'] \n", "2026 ['Drama'] \n", "\n", "[2026 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Junte os conjuntos de dados 'movies' e 'ratings' usando a coluna 'movieId'\n", "data = pd.merge(ratings, movies, on='movieId')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Criar uma matriz de recursos TF-IDF para a descrição do filme\n", "tfidf_vectorizer = TfidfVectorizer(stop_words='english')\n", "tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])\n", "\n", "# Adicionar a matriz TF-IDF ao conjunto de dados\n", "movies_tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=movies.index)\n", "\n", "# Concatenar o conjunto de dados original com a matriz TF-IDF\n", "data = pd.concat([data, movies_tfidf], axis=1).fillna(0)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamptitlegenres0123...11121314151617181920
0114.0964982703Toy Story['Adventure', 'Animation', 'Children', 'Comedy...0.00.0000000.0000000.000000...0.00.00.00.00.00.0000000.00.00.00.0
1514.0847434962Toy Story['Adventure', 'Animation', 'Children', 'Comedy...0.00.3693850.5640130.495978...0.00.00.00.00.00.0000000.00.00.00.0
2714.51106635946Toy Story['Adventure', 'Animation', 'Children', 'Comedy...0.00.4744500.0000000.637051...0.00.00.00.00.00.0000000.00.00.00.0
31512.51510577970Toy Story['Adventure', 'Animation', 'Children', 'Comedy...0.00.0000000.0000000.000000...0.00.00.00.00.00.8192990.00.00.00.0
41714.51305696483Toy Story['Adventure', 'Animation', 'Children', 'Comedy...0.00.0000000.0000000.000000...0.00.00.00.00.00.0000000.00.00.00.0
..................................................................
3585154613273.0973588711Scooby-Doo['Adventure', 'Children', 'Comedy', 'Fantasy',...0.00.0000000.0000000.000000...0.00.00.00.00.00.0000000.00.00.00.0
3585255513273.0978748648Scooby-Doo['Adventure', 'Children', 'Comedy', 'Fantasy',...0.00.0000000.0000000.000000...0.00.00.00.00.00.0000000.00.00.00.0
3585357113275.0966900601Scooby-Doo['Adventure', 'Children', 'Comedy', 'Fantasy',...0.00.0000000.0000000.000000...0.00.00.00.00.00.0000000.00.00.00.0
3585460013272.01237710102Scooby-Doo['Adventure', 'Children', 'Comedy', 'Fantasy',...0.00.0000000.0000000.000000...0.00.00.00.00.00.0000000.00.00.00.0
3585560713275.0963079647Scooby-Doo['Adventure', 'Children', 'Comedy', 'Fantasy',...0.00.0000000.0000000.000000...0.00.00.00.00.00.0000000.00.00.00.0
\n", "

35856 rows × 27 columns

\n", "
" ], "text/plain": [ " userId movieId rating timestamp title \\\n", "0 1 1 4.0 964982703 Toy Story \n", "1 5 1 4.0 847434962 Toy Story \n", "2 7 1 4.5 1106635946 Toy Story \n", "3 15 1 2.5 1510577970 Toy Story \n", "4 17 1 4.5 1305696483 Toy Story \n", "... ... ... ... ... ... \n", "35851 546 1327 3.0 973588711 Scooby-Doo \n", "35852 555 1327 3.0 978748648 Scooby-Doo \n", "35853 571 1327 5.0 966900601 Scooby-Doo \n", "35854 600 1327 2.0 1237710102 Scooby-Doo \n", "35855 607 1327 5.0 963079647 Scooby-Doo \n", "\n", " genres 0 1 \\\n", "0 ['Adventure', 'Animation', 'Children', 'Comedy... 0.0 0.000000 \n", "1 ['Adventure', 'Animation', 'Children', 'Comedy... 0.0 0.369385 \n", "2 ['Adventure', 'Animation', 'Children', 'Comedy... 0.0 0.474450 \n", "3 ['Adventure', 'Animation', 'Children', 'Comedy... 0.0 0.000000 \n", "4 ['Adventure', 'Animation', 'Children', 'Comedy... 0.0 0.000000 \n", "... ... ... ... \n", "35851 ['Adventure', 'Children', 'Comedy', 'Fantasy',... 0.0 0.000000 \n", "35852 ['Adventure', 'Children', 'Comedy', 'Fantasy',... 0.0 0.000000 \n", "35853 ['Adventure', 'Children', 'Comedy', 'Fantasy',... 0.0 0.000000 \n", "35854 ['Adventure', 'Children', 'Comedy', 'Fantasy',... 0.0 0.000000 \n", "35855 ['Adventure', 'Children', 'Comedy', 'Fantasy',... 0.0 0.000000 \n", "\n", " 2 3 ... 11 12 13 14 15 16 17 18 \\\n", "0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "1 0.564013 0.495978 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "2 0.000000 0.637051 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "3 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.819299 0.0 0.0 \n", "4 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... \n", "35851 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "35852 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "35853 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "35854 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "35855 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n", "\n", " 19 20 \n", "0 0.0 0.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 0.0 0.0 \n", "4 0.0 0.0 \n", "... ... ... \n", "35851 0.0 0.0 \n", "35852 0.0 0.0 \n", "35853 0.0 0.0 \n", "35854 0.0 0.0 \n", "35855 0.0 0.0 \n", "\n", "[35856 rows x 27 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#data.columns = data.columns.astype(str)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 35856 entries, 0 to 35855\n", "Data columns (total 27 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 userId 35856 non-null int64 \n", " 1 movieId 35856 non-null int64 \n", " 2 rating 35856 non-null float64\n", " 3 timestamp 35856 non-null int64 \n", " 4 title 35856 non-null object \n", " 5 genres 35856 non-null object \n", " 6 0 35856 non-null float64\n", " 7 1 35856 non-null float64\n", " 8 2 35856 non-null float64\n", " 9 3 35856 non-null float64\n", " 10 4 35856 non-null float64\n", " 11 5 35856 non-null float64\n", " 12 6 35856 non-null float64\n", " 13 7 35856 non-null float64\n", " 14 8 35856 non-null float64\n", " 15 9 35856 non-null float64\n", " 16 10 35856 non-null float64\n", " 17 11 35856 non-null float64\n", " 18 12 35856 non-null float64\n", " 19 13 35856 non-null float64\n", " 20 14 35856 non-null float64\n", " 21 15 35856 non-null float64\n", " 22 16 35856 non-null float64\n", " 23 17 35856 non-null float64\n", " 24 18 35856 non-null float64\n", " 25 19 35856 non-null float64\n", " 26 20 35856 non-null float64\n", "dtypes: float64(22), int64(3), object(2)\n", "memory usage: 7.4+ MB\n" ] } ], "source": [ "data.info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(data.drop(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'], axis=1), data['rating'], test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...11121314151617181920
219850.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
137260.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
253850.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
267980.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
101530.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
168500.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
62650.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
112840.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
8600.00.00.00.00.00.00.01.00.00.0...0.00.00.00.00.00.00.00.00.00.0
157950.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

28684 rows × 21 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 11 12 13 \\\n", "21985 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "13726 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "25385 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "26798 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "10153 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n", "16850 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "6265 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "11284 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "860 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "15795 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", "\n", " 14 15 16 17 18 19 20 \n", "21985 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "13726 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "25385 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "26798 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "10153 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... \n", "16850 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "6265 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "11284 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "860 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "15795 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[28684 rows x 21 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('linearregression', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('linearregression', LinearRegression())])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = make_pipeline(LinearRegression())\n", "model.fit(X_train, y_train)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Squared Error: 1.084486723654248\n" ] } ], "source": [ "predictions = model.predict(X_test)\n", "mse = mean_squared_error(y_test, predictions)\n", "print(f'Mean Squared Error: {mse}')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Previsão de nota para o filme: 3.600845094489842\n" ] } ], "source": [ "# Substitua user_id e movie_id pelos valores desejados\n", "user_id = 1\n", "movie_id = 1\n", "\n", "user_movie_data = data[(data['userId'] == user_id) & (data['movieId'] == movie_id)].drop(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'], axis=1)\n", "prediction = model.predict(user_movie_data)\n", "\n", "print(f'Previsão de nota para o filme: {prediction[0]}')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }