aminian
/

ML-final-project

code

Model card Files Files and versions Community

aminian commited on Jan 24, 2023

Commit

11e3994

•

1 Parent(s): b6c882b

Implement MLOps API

Browse files

Files changed (1) hide show

recommender.py +142 -0

recommender.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import numpy as np
+import pandas as pd
+from sklearn.utils.extmath import randomized_svd
+import mlflow
+def recommend_top_k(preds_df, ratings_df, movie, userId, k=10):
+    user_row = userId - 1
+    sorted_user_predictions = preds_df.iloc[user_row].sort_values(ascending=False)
+    user_data = ratings_df[ratings_df.userId == (userId)]
+    user_rated = user_data.merge(movie, how='left', left_on='movieId', right_on='movieId'). \
+        sort_values(['rating'], ascending=False)
+    user_preds = movie.merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left',
+                             on='movieId').rename(columns={user_row: 'prediction'}). \
+                     sort_values('prediction', ascending=False). \
+                     iloc[:k, :]
+    return user_rated, user_preds
+def precision_at_k(df, k=10, y_test: str = 'rating', y_pred='prediction'):
+    dfK = df.head(k)
+    sum_df = dfK[y_pred].sum()
+    true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
+    if sum_df > 0:
+        return true_pred / sum_df
+    else:
+        return None
+def recall_at_k(df, k=10, y_test='rating', y_pred='prediction'):
+    dfK = df.head(k)
+    sum_df = df[y_test].sum()
+    true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
+    if sum_df > 0:
+        return true_pred / sum_df
+    else:
+        return None
+class CollaborativeModel(mlflow.pyfunc.PythonModel):
+    def fit(self, num_components=15, threshold=2):
+        """### explore datasets"""
+        rating = pd.read_csv('IMDB/ratings_small.csv')
+        movie = pd.read_csv('IMDB/movies_metadata.csv')
+        movie = movie.rename(columns={'id': 'movieId'})
+        """### data preprocessing
+        There are three rows entered by mistake, so we remove that row.
+        """
+        movie = movie[
+            (movie['movieId'] != '1997-08-20') & (movie['movieId'] != '2012-09-29') & (
+                    movie['movieId'] != '2014-01-01')]
+        def find_names(x):
+            if x == '':
+                return ''
+            genre_arr = eval(str(x))
+            return ','.join(i['name'] for i in eval(str(x)))
+        movie['genres'] = movie['genres'].fillna('')
+        movie['genres'] = movie['genres'].apply(find_names)
+        movie.movieId = movie.movieId.astype("uint64")
+        self.movie = movie
+        """only keep rating for movies with metadata in movie dataset"""
+        new_rating = pd.merge(rating, movie, how='inner', on=["movieId"])
+        new_rating = new_rating[["userId", "movieId", "rating"]]
+        self.new_rating = new_rating
+        """### matrix factorization"""
+        inter_mat_df = rating.pivot(index='userId', columns='movieId', values='rating').fillna(0)
+        inter_mat = inter_mat_df.to_numpy()
+        ratings_mean = np.mean(inter_mat, axis=1)
+        inter_mat_normal = inter_mat - ratings_mean.reshape(-1, 1)
+        """We use singular value decomposition for matrix factorization"""
+        svd_U, svd_sigma, svd_V = randomized_svd(inter_mat_normal,
+                                                 n_components=num_components,
+                                                 n_iter=5,
+                                                 random_state=47)
+        """This function gives the diagonal form"""
+        svd_sigma = np.diag(svd_sigma)
+        """Making predictions"""
+        rating_weights = np.dot(np.dot(svd_U, svd_sigma), svd_V) + ratings_mean.reshape(-1, 1)
+        self.weights_df = pd.DataFrame(rating_weights, columns=inter_mat_df.columns)
+    def predict(self, context, model_input):
+        return self.my_custom_function(model_input)
+    def my_custom_function(self, model_input):
+        # do something with the model input
+        self.fit(15, 2)
+        print(model_input)
+        self.user_rated, self.user_preds = recommend_top_k(self.weights_df, self.new_rating, self.movie,
+                                                           int(model_input), 100)
+        return self.user_preds
+    def eval_metrics(self):
+        df_res = self.user_preds[["movieId", "prediction"]]. \
+            merge(self.user_rated[["movieId", "rating"]], how='outer', on='movieId')
+        df_res.sort_values(by='prediction', ascending=False, inplace=True)
+        df_res['prediction'] = df_res['prediction'] >= threshold
+        df_res['rating'] = df_res['rating'] >= threshold
+        prec_at_k = precision_at_k(df_res, 100, y_test='rating', y_pred='prediction')
+        rec_at_k = recall_at_k(df_res, 100, y_test='rating', y_pred='prediction')
+        print("precision@k: ", prec_at_k)
+        print("recall@k: ", rec_at_k)
+        return prec_at_k, rec_at_k
+c_model = CollaborativeModel()
+with mlflow.start_run():
+    model_info = mlflow.pyfunc.log_model(artifact_path="model", python_model=c_model)
+    num_components = 15
+    threshold = 2
+    c_model.fit(num_components, threshold)
+    c_model.predict(context=pd.DataFrame([]), model_input=220)
+    prec_at_k, rec_at_k = c_model.eval_metrics()
+    mlflow.log_param("num_components", num_components)
+    mlflow.log_param("threshold", threshold)
+    mlflow.log_metric("precision_at_k", prec_at_k)
+    mlflow.log_metric("recall_at_k", rec_at_k)
+    print("Model saved in run %s" % mlflow.active_run().info.run_uuid)