{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.pipeline import make_pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# The MovieLens 100k dataset can be downloaded from https://grouplens.org/datasets/movielens/\n", "# Load the local reduced files (presumably derived from the MovieLens 'movies.csv' and 'ratings.csv' - TODO confirm)\n", "\n", "movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')\n", "ratings = pd.read_csv('../data/reduced/ratings_m10.csv')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "movies = movies[['title', 'genres']]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | title | \n", "genres | \n", "
---|---|---|
movieId | \n", "\n", " | \n", " |
1 | \n", "Toy Story | \n", "['Adventure', 'Animation', 'Children', 'Comedy... | \n", "
2 | \n", "Jumanji | \n", "['Adventure', 'Children', 'Fantasy'] | \n", "
3 | \n", "Grumpier Old Men | \n", "['Comedy', 'Romance'] | \n", "
4 | \n", "Father of the Bride Part II | \n", "['Comedy'] | \n", "
5 | \n", "Heat | \n", "['Action', 'Crime', 'Thriller'] | \n", "
... | \n", "... | \n", "... | \n", "
2022 | \n", "The Revenant | \n", "['Adventure', 'Drama'] | \n", "
2023 | \n", "Sicario | \n", "['Crime', 'Drama', 'Mystery'] | \n", "
2024 | \n", "The Intern | \n", "['Comedy'] | \n", "
2025 | \n", "Spotlight | \n", "['Thriller'] | \n", "
2026 | \n", "Big Short, The | \n", "['Drama'] | \n", "
2026 rows × 2 columns
\n", "\n", " | userId | \n", "movieId | \n", "rating | \n", "timestamp | \n", "title | \n", "genres | \n", "0 | \n", "1 | \n", "2 | \n", "3 | \n", "... | \n", "11 | \n", "12 | \n", "13 | \n", "14 | \n", "15 | \n", "16 | \n", "17 | \n", "18 | \n", "19 | \n", "20 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1 | \n", "4.0 | \n", "964982703 | \n", "Toy Story | \n", "['Adventure', 'Animation', 'Children', 'Comedy... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1 | \n", "5 | \n", "1 | \n", "4.0 | \n", "847434962 | \n", "Toy Story | \n", "['Adventure', 'Animation', 'Children', 'Comedy... | \n", "0.0 | \n", "0.369385 | \n", "0.564013 | \n", "0.495978 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2 | \n", "7 | \n", "1 | \n", "4.5 | \n", "1106635946 | \n", "Toy Story | \n", "['Adventure', 'Animation', 'Children', 'Comedy... | \n", "0.0 | \n", "0.474450 | \n", "0.000000 | \n", "0.637051 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
3 | \n", "15 | \n", "1 | \n", "2.5 | \n", "1510577970 | \n", "Toy Story | \n", "['Adventure', 'Animation', 'Children', 'Comedy... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.819299 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
4 | \n", "17 | \n", "1 | \n", "4.5 | \n", "1305696483 | \n", "Toy Story | \n", "['Adventure', 'Animation', 'Children', 'Comedy... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
35851 | \n", "546 | \n", "1327 | \n", "3.0 | \n", "973588711 | \n", "Scooby-Doo | \n", "['Adventure', 'Children', 'Comedy', 'Fantasy',... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
35852 | \n", "555 | \n", "1327 | \n", "3.0 | \n", "978748648 | \n", "Scooby-Doo | \n", "['Adventure', 'Children', 'Comedy', 'Fantasy',... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
35853 | \n", "571 | \n", "1327 | \n", "5.0 | \n", "966900601 | \n", "Scooby-Doo | \n", "['Adventure', 'Children', 'Comedy', 'Fantasy',... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
35854 | \n", "600 | \n", "1327 | \n", "2.0 | \n", "1237710102 | \n", "Scooby-Doo | \n", "['Adventure', 'Children', 'Comedy', 'Fantasy',... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
35855 | \n", "607 | \n", "1327 | \n", "5.0 | \n", "963079647 | \n", "Scooby-Doo | \n", "['Adventure', 'Children', 'Comedy', 'Fantasy',... | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
35856 rows × 27 columns
\n", "\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "... | \n", "11 | \n", "12 | \n", "13 | \n", "14 | \n", "15 | \n", "16 | \n", "17 | \n", "18 | \n", "19 | \n", "20 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
21985 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
13726 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
25385 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
26798 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
10153 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
16850 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
6265 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
11284 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
860 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
15795 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
28684 rows × 21 columns
\n", "Pipeline(steps=[('linearregression', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('linearregression', LinearRegression())])
LinearRegression()