import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


def train_model(data, user_id, test=None, eval=False):
    """Train a per-user XGBoost rating model and optionally report test RMSE."""
    # Select only this user's ratings as the training set.
    train_user = data[data['userId'] == user_id]
    X_train = train_user.drop(columns=['userId', 'rating', 'Train', 'title'])
    y_train = train_user['rating']

    model = XGBRegressor()
    model.fit(X_train, y_train)

    if eval:
        # Evaluate on the held-out ratings of the same user.
        test_user = test[test['userId'] == user_id]
        X_test = test_user.drop(columns=['userId', 'rating', 'Train', 'title'])
        y_test = test_user['rating']
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f'RMSE: {rmse:.4f}')
        # Optional debugging output:
        # print("Predicted rating:", y_pred)
        # print("Actual rating:", y_test)
        # print(X_test)

    return model


def get_user_recommendation_XGBoost(all_movies, model, user_id, n=10):
    """Recommend the n unseen movies with the highest predicted rating for user_id."""
    # Movies the user has already rated, and everything else (deduplicated by title).
    user_seen_movies = all_movies[all_movies['userId'] == user_id]['title']
    user_unseen_movies = all_movies[~all_movies['title'].isin(user_seen_movies)]
    user_unseen_movies = user_unseen_movies.drop_duplicates(subset=['title']).copy()

    # Predict a rating for every unseen movie.
    user_unseen_movies['Pred_rating'] = model.predict(
        user_unseen_movies.drop(columns=['userId', 'rating', 'Train', 'title'])
    )

    # Return the top n titles by predicted rating.
    recommendations = (
        user_unseen_movies.sort_values(by='Pred_rating', ascending=False).head(n)['title']
    )
    return recommendations, user_seen_movies


def seen_movies(dataBase, user_id):
    """Return the titles of all movies the user has already rated."""
    return dataBase[dataBase['userId'] == user_id]['title'].values


def get_user_recommendation(DataBase, Matrix, user_id, l=10):
    """User-based collaborative filtering: recommend movies liked by the most similar users."""
    # Similarity scores between user_id and every other user, most similar first.
    user = Matrix[user_id].sort_values(ascending=False)

    # Only recommend movies the target user has not already seen.
    user_seen_movies = DataBase[DataBase['userId'] == user_id]['title']

    recommendations = []
    # Skip index 0 (the user themself) and walk the nine most similar users.
    for U in user.index[1:10]:
        # Movies rated 4 or higher by user U that the target user has not seen,
        # sorted by U's rating from highest to lowest.
        rated = DataBase[(DataBase['userId'] == U) & (DataBase['rating'] >= 4)]
        rated = rated[~rated['title'].isin(user_seen_movies)]
        movies = rated.sort_values(by='rating', ascending=False)['title']

        # Take up to four titles from each similar user until l recommendations are collected.
        for movie in movies[:4]:
            if movie not in recommendations:
                recommendations.append(movie)
        if len(recommendations) >= l:
            break

    return recommendations


def get_recommendation_item(dataBase, matrix, movie_name, n=10):
    """Item-based collaborative filtering: return the n movies most similar to movie_name."""
    similar_scores = matrix[movie_name].sort_values(ascending=False)
    # Only keep movies with more than 100 ratings to avoid obscure matches.
    similar_scores = similar_scores[
        similar_scores.index.isin(dataBase[dataBase['number_of_ratings'] > 100].index)
    ][:n]
    return similar_scores


if __name__ == '__main__':

    def load_similarity_matrix(path):
        with open(path, 'rb') as f:
            similarity_df = pickle.load(f)
        return similarity_df

    # Load the data
    DataBaseCSV = r"D:\Study\ITI\Recommender Systems\Final\Movies-Recommender-System\Data\XGBoost_database.csv"
    DataBase = pd.read_csv(DataBaseCSV)

    # Load the user-based similarity matrix
    MatrixPKL = r"D:\Study\ITI\Recommender Systems\Final\Movies-Recommender-System\Models\user_based_matrix.pkl"
    Matrix = load_similarity_matrix(MatrixPKL)

    recommendations = get_user_recommendation(DataBase, Matrix, 1)
    print(recommendations)
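
    # Sketch of how the other recommenders could be exercised from this same entry point.
    # The item-based matrix path and the movie title below are placeholders (not files or
    # data confirmed to exist in this repo); adjust them to your local setup, and note that
    # train_model assumes DataBase contains the same feature columns it drops before fitting.
    #
    # ItemMatrixPKL = r"path\to\item_based_matrix.pkl"  # hypothetical path
    # ItemMatrix = load_similarity_matrix(ItemMatrixPKL)
    # print(get_recommendation_item(DataBase, ItemMatrix, "Toy Story (1995)"))  # example title
    #
    # xgb_model = train_model(DataBase, user_id=1)
    # top_n, seen = get_user_recommendation_XGBoost(DataBase, xgb_model, user_id=1, n=10)
    # print(top_n)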