In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

In [3]:
# Você pode baixar o conjunto de dados MovieLens 100k em https://grouplens.org/datasets/movielens/
# Carregue os arquivos 'movies.csv' e 'ratings.csv'

movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')
ratings = pd.read_csv('../data/reduced/ratings_m10.csv')

In [4]:
movies = movies[['title', 'genres']]

In [5]:
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
2,Jumanji,"['Adventure', 'Children', 'Fantasy']"
3,Grumpier Old Men,"['Comedy', 'Romance']"
4,Father of the Bride Part II,['Comedy']
5,Heat,"['Action', 'Crime', 'Thriller']"
...,...,...
2022,The Revenant,"['Adventure', 'Drama']"
2023,Sicario,"['Crime', 'Drama', 'Mystery']"
2024,The Intern,['Comedy']
2025,Spotlight,['Thriller']


In [6]:
# Junte os conjuntos de dados 'movies' e 'ratings' usando a coluna 'movieId'
data = pd.merge(ratings, movies, on='movieId')

In [7]:
# Criar uma matriz de recursos TF-IDF para a descrição do filme
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])

# Adicionar a matriz TF-IDF ao conjunto de dados
movies_tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=movies.index)

# Concatenar o conjunto de dados original com a matriz TF-IDF
data = pd.concat([data, movies_tfidf], axis=1).fillna(0)


In [8]:
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,0,1,2,3,...,11,12,13,14,15,16,17,18,19,20
0,1,1,4.0,964982703,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,5,1,4.0,847434962,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.0,0.369385,0.564013,0.495978,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,7,1,4.5,1106635946,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.0,0.474450,0.000000,0.637051,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,15,1,2.5,1510577970,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.819299,0.0,0.0,0.0,0.0
4,17,1,4.5,1305696483,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35851,546,1327,3.0,973588711,Scooby-Doo,"['Adventure', 'Children', 'Comedy', 'Fantasy',...",0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
35852,555,1327,3.0,978748648,Scooby-Doo,"['Adventure', 'Children', 'Comedy', 'Fantasy',...",0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
35853,571,1327,5.0,966900601,Scooby-Doo,"['Adventure', 'Children', 'Comedy', 'Fantasy',...",0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
35854,600,1327,2.0,1237710102,Scooby-Doo,"['Adventure', 'Children', 'Comedy', 'Fantasy',...",0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [9]:
#data.columns = data.columns.astype(str)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35856 entries, 0 to 35855
Data columns (total 27 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     35856 non-null  int64  
 1   movieId    35856 non-null  int64  
 2   rating     35856 non-null  float64
 3   timestamp  35856 non-null  int64  
 4   title      35856 non-null  object 
 5   genres     35856 non-null  object 
 6   0          35856 non-null  float64
 7   1          35856 non-null  float64
 8   2          35856 non-null  float64
 9   3          35856 non-null  float64
 10  4          35856 non-null  float64
 11  5          35856 non-null  float64
 12  6          35856 non-null  float64
 13  7          35856 non-null  float64
 14  8          35856 non-null  float64
 15  9          35856 non-null  float64
 16  10         35856 non-null  float64
 17  11         35856 non-null  float64
 18  12         35856 non-null  float64
 19  13         35856 non-null  float64
 20  14    

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'], axis=1), data['rating'], test_size=0.2, random_state=42)

In [12]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
21985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
model = make_pipeline(LinearRegression())
model.fit(X_train, y_train)


In [14]:
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1.084486723654248


In [15]:
# Substitua user_id e movie_id pelos valores desejados
user_id = 1
movie_id = 1

user_movie_data = data[(data['userId'] == user_id) & (data['movieId'] == movie_id)].drop(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'], axis=1)
prediction = model.predict(user_movie_data)

print(f'Previsão de nota para o filme: {prediction[0]}')

Previsão de nota para o filme: 3.600845094489842
