import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


def train_model(data, user_id, test=None, eval=False):
    """Fit a per-user XGBoost regressor on that user's ratings."""
    # Select only this user's rows from the training data.
    train_user = data[data['userId'] == user_id]

    X_train = train_user.drop(columns=['userId', 'rating', 'Train', 'title'])
    y_train = train_user['rating']

    model = XGBRegressor()
    model.fit(X_train, y_train)

    # Optional evaluation on the held-out test set for the same user.
    if eval:
        test_user = test[test['userId'] == user_id]
        X_test = test_user.drop(columns=['userId', 'rating', 'Train', 'title'])
        y_test = test_user['rating']
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f'RMSE: {rmse:.4f}')
        # print("Predicted rating:", y_pred)
        # print("Actual rating:", y_test)

    return model

def get_user_recommendation_XGBoost(all_movies, model, user_id, n=10):
    """Recommend the top-n unseen movies for a user with the fitted model."""
    # Get all movies that the user has not seen.
    user_seen_movies = all_movies[all_movies['userId'] == user_id]['title']
    user_unseen_movies = all_movies[~all_movies['title'].isin(user_seen_movies)]

    # Drop duplicate titles; copy to avoid pandas' SettingWithCopyWarning.
    user_unseen_movies = user_unseen_movies.drop_duplicates(subset=['title']).copy()

    # Predict a rating for every unseen movie.
    user_unseen_movies['Pred_rating'] = model.predict(
        user_unseen_movies.drop(columns=['userId', 'rating', 'Train', 'title'])
    )

    # TODO: only return movies with more than 100 ratings.

    # Get the top-n recommendations by predicted rating.
    recommendations = user_unseen_movies.sort_values(by='Pred_rating', ascending=False).head(n)['title']
    return recommendations, user_seen_movies



def get_user_recommendation(DataBase, Matrix, user_id, l=10):
    """Recommend up to l movies liked by the most similar users (user-based CF)."""
    # Similarity of every user to the target user, most similar first.
    user = Matrix[user_id].sort_values(ascending=False)

    # Only recommend movies that the target user has not seen.
    user_seen_movies = DataBase[DataBase['userId'] == user_id]['title']

    # Loop over the most similar users (index 0 is the target user themselves).
    recommendations = []
    for U in user.index[1:10]:
        # Movies that U rated 4 or higher and the target user has not seen,
        # sorted by U's rating.
        movies = DataBase[(DataBase['userId'] == U) & (DataBase['rating'] >= 4)]
        movies = movies[~movies['title'].isin(user_seen_movies)]
        movies = movies.sort_values(by='rating', ascending=False)

        # Take up to 4 titles from each similar user.
        for movie in movies['title'][:4]:
            if movie not in recommendations:
                recommendations.append(movie)

        if len(recommendations) >= l:
            break

    return recommendations





if __name__ == '__main__':
    import pickle

    def load_similarity_matrix(path):
        with open(path, 'rb') as f:
            similarity_df = pickle.load(f)
        return similarity_df

    # Load the data
    DataBaseCSV = r"D:\Study\ITI\Recommender Systems\Final\Movies-Recommender-System\Data\XGBoost_database.csv"
    DataBase = pd.read_csv(DataBaseCSV)
    # Load the pickled user-user similarity matrix
    MatrixPKL = r"D:\Study\ITI\Recommender Systems\Final\Movies-Recommender-System\Models\user_based_matrix.pkl"
    Matrix = load_similarity_matrix(MatrixPKL)
    recommendations = get_user_recommendation(DataBase, Matrix, 1)
    print(recommendations)
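
    # Sketch of the XGBoost path, shown for comparison. This assumes the
    # XGBoost_database.csv loaded above has the columns train_model expects
    # (userId, rating, Train, title plus numeric feature columns).
    xgb_model = train_model(DataBase, user_id=1)
    xgb_recommendations, seen = get_user_recommendation_XGBoost(DataBase, xgb_model, user_id=1)
    print(xgb_recommendations)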