Amir Hossein Advari
committed on
Commit
•
e3bbaeb
1
Parent(s):
e811a7a
Upload 16 files
Browse files- load-model.py +9 -0
- log-model.py +8 -0
- mlflow-dockerfile/Dockerfile +40 -0
- mlflow-dockerfile/model_dir/recommender-model/MLmodel +12 -0
- mlflow-dockerfile/model_dir/recommender-model/conda.yaml +16 -0
- mlflow-dockerfile/model_dir/recommender-model/python_env.yaml +7 -0
- mlflow-dockerfile/model_dir/recommender-model/python_model.pkl +3 -0
- mlflow-dockerfile/model_dir/recommender-model/requirements.txt +9 -0
- movie-recommender.ipynb +0 -0
- movie_recommender.py +312 -0
- recommender-model/MLmodel +12 -0
- recommender-model/conda.yaml +16 -0
- recommender-model/python_env.yaml +7 -0
- recommender-model/python_model.pkl +3 -0
- recommender-model/requirements.txt +9 -0
- save-model.py +6 -0
load-model.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import mlflow

# Directory of the locally saved MLflow pyfunc model (written by save-model.py).
model_path = "recommender-model"

# Load the model back through the generic pyfunc wrapper.
loaded_model = mlflow.pyfunc.load_model(model_path)
# Input format: [user_id, type] where type selects the recommender
# (1=content-based, 2=collaborative, 3=ensemble collaborative-based,
# 4=ensemble intersection-based) — see RecommenderSystemModel.my_custom_function.
print(loaded_model.predict([1, 1]))
print(loaded_model.predict([1, 2]))
print(loaded_model.predict([1, 3]))
print(loaded_model.predict([1, 4]))
|
log-model.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import mlflow
from movie_recommender import RecommenderSystemModel

# Artifact path under which the pyfunc model is logged to the active run.
model_path = "recommender-model"

# Use start_run() as a context manager so the run is always terminated
# (and marked FAILED) even if log_model raises, instead of the original
# start_run()/end_run() pair that could leave a dangling active run.
with mlflow.start_run():
    mlflow.pyfunc.log_model(model_path, python_model=RecommenderSystemModel())
|
mlflow-dockerfile/Dockerfile
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Build an image that can serve mlflow models.
FROM ubuntu:20.04

RUN apt-get -y update
# Non-interactive base install; JDK 8 and maven are required for the
# mlflow-scoring Java artifacts fetched below, nginx for the scoring server.
RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y --no-install-recommends wget curl nginx ca-certificates bzip2 build-essential cmake openjdk-8-jdk git-core maven && rm -rf /var/lib/apt/lists/*


# Setup miniconda
RUN curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh >> miniconda.sh
RUN bash ./miniconda.sh -b -p /miniconda && rm ./miniconda.sh
ENV PATH="/miniconda/bin:$PATH"


ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV GUNICORN_CMD_ARGS="--timeout 60 -k gevent"
# Set up the program in the image
WORKDIR /opt/mlflow

RUN pip install mlflow==2.1.1
# Fetch the mlflow-scoring Java artifact (pom + jar) and its dependency tree.
RUN mvn --batch-mode dependency:copy -Dartifact=org.mlflow:mlflow-scoring:2.1.1:pom -DoutputDirectory=/opt/java
RUN mvn --batch-mode dependency:copy -Dartifact=org.mlflow:mlflow-scoring:2.1.1:jar -DoutputDirectory=/opt/java/jars
RUN cp /opt/java/mlflow-scoring-2.1.1.pom /opt/java/pom.xml
RUN cd /opt/java && mvn --batch-mode dependency:copy-dependencies -DoutputDirectory=/opt/java/jars


# Bake the model into the image and pre-install its conda environment so the
# container does not resolve dependencies at startup.
COPY model_dir/recommender-model /opt/ml/model
RUN python -c 'from mlflow.models.container import _install_pyfunc_deps; _install_pyfunc_deps( "/opt/ml/model", install_mlflow=False, enable_mlserver=False, env_manager="conda")'
ENV MLFLOW_DISABLE_ENV_CREATION="true"
ENV ENABLE_MLSERVER=False


# granting read/write access and conditional execution authority to all child directories
# and files to allow for deployment to AWS Sagemaker Serverless Endpoints
# (see https://docs.aws.amazon.com/sagemaker/latest/dg/serverless-endpoints.html)
RUN chmod o+rwX /opt/mlflow/

ENTRYPOINT ["python", "-c", "from mlflow.models import container as C;C._serve('conda')"]
|
mlflow-dockerfile/model_dir/recommender-model/MLmodel
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
flavors:
|
2 |
+
python_function:
|
3 |
+
cloudpickle_version: 2.2.1
|
4 |
+
env:
|
5 |
+
conda: conda.yaml
|
6 |
+
virtualenv: python_env.yaml
|
7 |
+
loader_module: mlflow.pyfunc.model
|
8 |
+
python_model: python_model.pkl
|
9 |
+
python_version: 3.10.6
|
10 |
+
mlflow_version: 2.1.1
|
11 |
+
model_uuid: 781a5cfec6964e7098c03302c14d6f67
|
12 |
+
utc_time_created: '2023-01-20 22:41:17.799682'
|
mlflow-dockerfile/model_dir/recommender-model/conda.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
channels:
|
2 |
+
- conda-forge
|
3 |
+
dependencies:
|
4 |
+
- python=3.10.6
|
5 |
+
- pip<=22.3.1
|
6 |
+
- pip:
|
7 |
+
- mlflow<3,>=2.1
|
8 |
+
- astunparse==1.6.3
|
9 |
+
- cffi==1.15.1
|
10 |
+
- cloudpickle==2.2.1
|
11 |
+
- defusedxml==0.7.1
|
12 |
+
- ipython==8.7.0
|
13 |
+
- opt-einsum==3.3.0
|
14 |
+
- psutil==5.9.4
|
15 |
+
- torch==1.13.1
|
16 |
+
name: mlflow-env
|
mlflow-dockerfile/model_dir/recommender-model/python_env.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python: 3.10.6
|
2 |
+
build_dependencies:
|
3 |
+
- pip==22.3.1
|
4 |
+
- setuptools==65.7.0
|
5 |
+
- wheel==0.38.4
|
6 |
+
dependencies:
|
7 |
+
- -r requirements.txt
|
mlflow-dockerfile/model_dir/recommender-model/python_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9eade417b5d023b0cb4d5039565ec1bf679f564a92903a4d721f2d4cf8d1eebb
|
3 |
+
size 15534
|
mlflow-dockerfile/model_dir/recommender-model/requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
mlflow<3,>=2.1
|
2 |
+
astunparse==1.6.3
|
3 |
+
cffi==1.15.1
|
4 |
+
cloudpickle==2.2.1
|
5 |
+
defusedxml==0.7.1
|
6 |
+
ipython==8.7.0
|
7 |
+
opt-einsum==3.3.0
|
8 |
+
psutil==5.9.4
|
9 |
+
torch==1.13.1
|
movie-recommender.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
movie_recommender.py
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
import mlflow
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
+
import itertools
|
9 |
+
import mlflow
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
|
12 |
+
import pandas as pd
|
13 |
+
import numpy as np
|
14 |
+
import torch
|
15 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
16 |
+
|
17 |
+
|
18 |
+
############################################################ Content Base ############################################################
|
19 |
+
class ContentBasedRecommendation():
    """Content-based movie recommender built from the IMDB metadata CSVs.

    Each movie is reduced to one bag-of-words "token" string (genres,
    overview, keywords, cast, director, collection); movie-to-movie
    similarity is the cosine similarity of the TF-IDF vectors of those
    strings.
    """

    def __init__(self):
        # Pipeline: load CSVs -> merged/cleaned id/title/token dataframe
        # -> TF-IDF vectors -> movie-to-movie cosine-similarity matrix.
        self.credits_ds, self.links_ds, self.ratings_ds, self.keywords_ds, self.movies_metadata_ds = self.load_datasets()
        self.df = self.process_datasets(self.movies_metadata_ds, self.credits_ds, self.keywords_ds)
        self.vectorized_data = self.vectorize_data(self.df)
        self.similarity = self.calculate_similarity(self.vectorized_data)

    def load_datasets(self):
        """Read the five IMDB CSV files from ./dataset/IMDB (paths relative to the CWD)."""
        credits_ds = pd.read_csv('./dataset/IMDB/credits.csv')
        links_ds = pd.read_csv('./dataset/IMDB/links_small.csv')
        ratings_ds = pd.read_csv('./dataset/IMDB/ratings_small.csv')
        keywords_ds = pd.read_csv('./dataset/IMDB/keywords.csv')
        movies_metadata_ds = pd.read_csv('./dataset/IMDB/movies_metadata.csv')
        return credits_ds, links_ds, ratings_ds, keywords_ds, movies_metadata_ds

    def process_movies_metadata(self, movies_metadata_ds):
        """Filter to released, sufficiently-voted movies and flatten genre/collection names."""
        mlflow.log_param("Movies_Metadata Shape Before Data Cleaning", movies_metadata_ds.shape)
        mlflow.log_param("Movies_Metadata Column Before Cleaning", movies_metadata_ds.columns)
        movies_metadata_ds = movies_metadata_ds[movies_metadata_ds['status'] == 'Released']
        movies_metadata_ds = movies_metadata_ds[movies_metadata_ds['vote_count'] > 40]
        movies_metadata_ds = movies_metadata_ds[movies_metadata_ds['vote_average'] >= 5]
        important_col = ['id', 'genres', 'overview', 'original_title', 'belongs_to_collection']
        movies_metadata_ds = movies_metadata_ds[important_col]
        movies_metadata_ds.reset_index(inplace=True, drop=True)
        # NOTE(review): eval() executes whatever text is in the CSV cell — this
        # is only safe for a trusted dataset; ast.literal_eval would be safer.
        movies_metadata_ds['genres'] = movies_metadata_ds['genres'].apply(
            lambda x: ' '.join([i['name'].lower().replace(' ', '') for i in eval(x)]))
        movies_metadata_ds['belongs_to_collection'] = movies_metadata_ds['belongs_to_collection'].apply(
            lambda x: eval(str(x))['name'].lower().replace(' ', '') if str(x).lower() != 'nan' else '')
        # Drop rows whose id is not numeric so the int cast below cannot fail.
        movies_metadata_ds = movies_metadata_ds[movies_metadata_ds['id'].str.isnumeric()]
        movies_metadata_ds['id'] = movies_metadata_ds['id'].astype(int)
        mlflow.log_param("Movies_Metadata Shape After Data Cleaning", movies_metadata_ds.shape)
        mlflow.log_param("Movies_Metadata Column After Cleaning", movies_metadata_ds.columns)
        return movies_metadata_ds

    def process_credits(self, credits_ds):
        """Collapse cast and director names into a single lower-cased 'cast' token column."""
        mlflow.log_param("Credits-Dataset Shape Before Data Cleaning", credits_ds.shape)
        mlflow.log_param("Credits-Dataset Columns Before Cleaning", credits_ds.columns)
        # NOTE(review): eval() on CSV cell contents — safe only for trusted data.
        credits_ds['cast'] = credits_ds['cast'].apply(
            lambda x: ' '.join([i['name'].lower().replace(' ', '') for i in eval(x)]))

        # Keep only directors from the crew list, then fold them into 'cast'.
        credits_ds['crew'] = credits_ds['crew'].apply(
            lambda x: [i['name'].lower().replace(' ', '') if i['job'] == 'Director' else '' for i in eval(x)])
        credits_ds['crew'] = credits_ds['crew'].apply(lambda x: ' '.join([i for i in x if i != '']))
        credits_ds['cast'] = credits_ds.apply(lambda x: x.loc['cast'] + ' ' + x.loc['crew'], axis=1)
        credits_ds = credits_ds[['id', 'cast']]
        credits_ds.reset_index(inplace=True, drop=True)
        mlflow.log_param("Credits-Dataset Shape after Data Cleaning", credits_ds.shape)
        mlflow.log_param("Credits-Dataset Columns after Cleaning", credits_ds.columns)
        return credits_ds

    def process_keywords(self, keywords_ds):
        """Flatten each movie's keyword dicts into one space-separated token string."""
        keywords_ds['keywords'] = keywords_ds['keywords'].apply(
            lambda x: ' '.join([i['name'].lower().replace(' ', '') for i in eval(x)]))
        return keywords_ds

    def make_general_df(self, movies_metadata_ds, credits_ds, keywords_ds):
        """Left-join keywords and credits onto the movie metadata by movie id."""
        df = pd.merge(movies_metadata_ds, keywords_ds, on='id', how='left')
        df = pd.merge(df, credits_ds, on='id', how='left')
        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)
        return df

    def clean_general_df(self, df):
        """Concatenate every text column into one 'token' column; keep id/title/token rows with a token."""
        col = list(df.columns)
        col.remove('id')
        col.remove('genres')
        col.remove('original_title')
        df['title'] = df['original_title']
        # Seed 'token' with the genres, then append every remaining text column.
        df['token'] = df['genres']
        for i in col:
            df['token'] = df['token'] + ' ' + df[i]
        df = df[['id', 'title', 'token']]
        df.drop(df[df['token'].isnull()].index, inplace=True)
        mlflow.log_param("Merged Dataset Shape", df.shape)
        mlflow.log_param("Merged Dataset Columns", df.columns)
        return df

    def process_datasets(self, movies_metadata_ds, credits_ds, keywords_ds):
        """Run the full cleaning/merging pipeline and return the id/title/token dataframe."""
        movies_metadata_ds = self.process_movies_metadata(movies_metadata_ds)
        credits_ds = self.process_credits(credits_ds)
        keywords_ds = self.process_keywords(keywords_ds)
        df = self.make_general_df(movies_metadata_ds, credits_ds, keywords_ds)
        df = self.clean_general_df(df)
        return df

    def vectorize_data(self, df, MAX_FEATURES=5000):
        """TF-IDF-vectorize the token strings (vocabulary capped at MAX_FEATURES)."""
        mlflow.log_metric("MAX_FEATURES in vectorizing tags column", MAX_FEATURES)
        tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
        vectorized_data = tfidf.fit_transform(df['token'].values)
        return vectorized_data

    def calculate_similarity(self, vectorized_data):
        """Movie-to-movie cosine-similarity matrix over the TF-IDF vectors."""
        similarity = cosine_similarity(vectorized_data)
        mlflow.log_param("Movies-Similarity", similarity)
        return similarity

    def content_recommendation_by_movie(self, df, similarity, title, number=20):
        """Return up to `number` titles most similar to `title` ([] if the title is unknown).

        NOTE(review): the queried movie itself has the highest self-similarity,
        so it appears in the returned list.
        """
        if len(df[df['title'] == title]) == 0:
            return []
        movie_id = df[df['title'] == title].index[0]
        distances = similarity[movie_id]

        # Log a plot of the top similarity scores as an MLflow figure artifact.
        fig, ax = plt.subplots()
        ax.plot(sorted(distances[:number], reverse=True))
        plt.title("similarities")
        plt.savefig("similarities.png")
        mlflow.log_figure(fig, "similarities.png")
        plt.close()

        movies = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])
        return [df.iloc[i[0]].title for i in movies[:number]]

    def content_recommendation_by_user(self, df, ratings_ds, similarity, user_id, number=20):
        """Recommend titles similar to each movie the user rated, best-rated first.

        NOTE(review): the `number` parameter is currently unused here; each
        per-movie call uses content_recommendation_by_movie's own default.
        """
        user_rate_ds = ratings_ds[ratings_ds['userId'] == user_id]
        sort = user_rate_ds.sort_values(by='rating', ascending=False)
        movie_id = sort['movieId']
        movie_list = [df[df['id'] == id]['title'].values[0] for id in movie_id if len(df[df['id'] == id]['title']) > 0]
        result = [self.content_recommendation_by_movie(df, similarity, str(title)) for title in movie_list]
        # Flatten the per-movie recommendation lists into one list.
        return list(itertools.chain.from_iterable(result))

    def predict_by_movie(self, title):
        """Titles similar to a given movie title."""
        recommendations = self.content_recommendation_by_movie(self.df, self.similarity, title)
        return recommendations

    def predict(self, user_id):
        """Titles recommended for a user based on the user's highest-rated movies."""
        recommendations = self.content_recommendation_by_user(self.df, self.ratings_ds, self.similarity, user_id)
        return recommendations

    def local_content_base_test(self):
        # Manual smoke test (requires the datasets on disk and an active MLflow run).
        print(self.predict_by_movie('Toy Story'))
        print('***************************************************************************')
        print(self.predict_by_movie('Jumanji'))
        print('***************************************************************************')
        print(self.predict_by_movie('Rocky III'))
        print('***************************************************************************')
        print(self.predict(1))
|
155 |
+
|
156 |
+
|
157 |
+
############################################################ Collaborative ############################################################
|
158 |
+
class CollaborativeRecommendation():
    """User-user collaborative-filtering recommender over the ratings CSV.

    Builds a dense (user x movie) rating matrix, computes user-to-user cosine
    similarity, and recommends the top-rated movies of the most similar users
    that the target user has not rated yet.
    """

    def __init__(self):
        self.movie_df, self.rate_df = self.load_dataframes()
        self.sparse_matrix = self.make_sparse_matrix(self.rate_df)
        self.similarities_sparse = self.make_similarity_sparse(self.sparse_matrix)

    def load_dataframes(self):
        """Read the movie metadata and ratings CSVs from ./dataset/IMDB."""
        movie_df = pd.read_csv('./dataset/IMDB/movies_metadata.csv')
        rate_df = pd.read_csv('./dataset/IMDB/ratings_small.csv')
        return movie_df, rate_df

    def make_sparse_matrix(self, rate_df):
        """Densify the (userId, movieId) -> rating triples into a matrix.

        The matrix is sized (max userId + 1, max movieId + 1) so the ids can
        index it directly.
        """
        user_max = rate_df['userId'].max()
        movie_max = rate_df['movieId'].max()
        i = torch.LongTensor(rate_df[['userId', 'movieId']].to_numpy())
        v = torch.FloatTensor(rate_df[['rating']].to_numpy().flatten())
        sparse_matrix = torch.sparse.FloatTensor(i.t(), v, torch.Size([user_max + 1, movie_max + 1])).to_dense()
        return sparse_matrix

    def make_similarity_sparse(self, sparse_matrix):
        """User-to-user cosine similarity over the rating matrix rows."""
        similarities_sparse = cosine_similarity(sparse_matrix, dense_output=False)
        mlflow.log_param("users similarity sparse matrix", similarities_sparse)
        return similarities_sparse

    def top_n_index_sparse(self, similarities_sparse, user_id, number=20):
        """Ids of the `number` users most similar to `user_id` (excluding the user itself)."""
        user_row = similarities_sparse[user_id]

        # Log a plot of the top similarity scores as an MLflow figure artifact.
        fig, ax = plt.subplots()
        ax.plot(list(sorted(user_row, reverse=True))[:number])
        plt.title("users-similarities")
        plt.savefig("users-similarities.png")
        mlflow.log_figure(fig, "users-similarities.png")
        plt.close()

        user_details = list(map(lambda x: (x[0], x[1]), enumerate(user_row)))
        sort = list(sorted(user_details, key=lambda x: x[1], reverse=True))
        # removing user itself (always the most similar entry)
        sort = sort[1:]
        result = list(map(lambda x: x[0], sort[:number]))
        return result

    def user_top_movies(self, rate_df, user_id, number=10):
        """The up-to-`number` movieIds the given user rated highest."""
        user_rate = rate_df[rate_df['userId'] == user_id]
        sort = user_rate.sort_values(by='rating', ascending=False)
        number = number if number <= len(sort) else len(sort)
        result = sort['movieId'].values[:number]
        return result

    def recommendation_for_user(self, movie_df, rate_df, similarities_sparse, user_id, number=20):
        """Up to `number` titles liked by similar users that `user_id` has not rated."""
        similar_users = self.top_n_index_sparse(similarities_sparse, user_id)
        movies = []
        for i in similar_users:
            # (was a side-effect list comprehension; extend is the idiomatic form)
            movies.extend(self.user_top_movies(rate_df, i))
        temp = rate_df[rate_df['userId'] == user_id]
        # BUG FIX: the original called movies.remove(i) while iterating over
        # `movies`, which skips the element immediately after each removal and
        # so leaves some already-rated movies in the candidate list. Filtering
        # into a new list removes every movie the user has already rated.
        movies = [m for m in movies if len(temp[temp['movieId'] == m]) == 0]
        titles = [movie_df[movie_df['id'] == str(id)]['title'].values[0] for id in movies if
                  len(movie_df[movie_df['id'] == str(id)]['title']) > 0]
        number = number if number < len(titles) else len(titles)
        return titles[:number]

    def predict(self, user_id):
        """Collaborative recommendations for a user."""
        recommendations = self.recommendation_for_user(self.movie_df, self.rate_df, self.similarities_sparse, user_id)
        return recommendations

    def local_collaborative_test(self):
        # Manual smoke test (requires the datasets on disk and an active MLflow run).
        print(self.recommendation_for_user(self.movie_df, self.rate_df, self.similarities_sparse, 1))
|
227 |
+
|
228 |
+
|
229 |
+
############################################################ Ensemble ############################################################
|
230 |
+
class EnsembleRecommendation():
    """Combines the collaborative and content-based recommenders."""

    def __init__(self):
        self.collab = CollaborativeRecommendation()
        self.content = ContentBasedRecommendation()

    def ensemble_recommendation_intersection_based(self, user_id, number=10):
        """Titles recommended by BOTH models first, then alternate between models.

        The intersection of the two recommendation lists is taken first; the
        remainder of `result` is filled by alternating between the two lists
        until `number` titles are collected or both lists are exhausted.
        """
        collaborative = self.collab.predict(user_id)
        content_based = self.content.predict(user_id)
        result = list(set(collaborative) & set(content_based))  # finding intersect
        for i in result:
            collaborative.remove(i)
            content_based.remove(i)
        collaborative_index = 0
        content_base_index = 0
        # BUG FIX: the original loop indexed past the end of either list (and
        # could raise IndexError) once a source ran out before `number` titles
        # were collected; stop when both are exhausted and skip an empty source.
        while len(result) < number and (collaborative_index < len(collaborative) or
                                        content_base_index < len(content_based)):
            take_content = (collaborative_index >= len(collaborative) or
                            (collaborative_index > content_base_index and
                             content_base_index < len(content_based)))
            if take_content:
                result.append(content_based[content_base_index])
                content_base_index = content_base_index + 1
            else:
                result.append(collaborative[collaborative_index])
                collaborative_index = collaborative_index + 1
        return result

    def ensemble_recommendation_collaborative_based(self, user_id, number=10):
        """Content-based recommendations seeded by the collaborative model's picks.

        For every collaboratively-recommended movie, collect its content-based
        neighbours, deduplicated in first-seen order.
        """
        collaborative = self.collab.predict(user_id)
        results = []
        for movie in collaborative:
            recommended_movies = self.content.predict_by_movie(movie)
            for i in recommended_movies:
                # (was an expression-statement conditional; plain `if` is clearer)
                if i not in results:
                    results.append(i)
        return results[:number]

    def predict(self, user_id, intersection_base=True):
        """Dispatch to one of the two ensemble strategies (10 titles)."""
        if intersection_base:
            return self.ensemble_recommendation_intersection_based(user_id, number=10)
        else:
            return self.ensemble_recommendation_collaborative_based(user_id, number=10)

    def local_test(self):
        # Manual smoke test (requires both sub-recommenders to be buildable).
        print(self.predict(1, intersection_base=True))
        print('***************************************************************************')
        print(self.predict(1, intersection_base=False))
|
272 |
+
|
273 |
+
|
274 |
+
############################################################ Testing Models ############################################################
|
275 |
+
# warnings.filterwarnings("ignore")
|
276 |
+
# ensemble = EnsembleRecommendation()
|
277 |
+
# ensemble.content.local_content_base_test()
|
278 |
+
# print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
|
279 |
+
# ensemble.collab.local_collaborative_test()
|
280 |
+
# print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
|
281 |
+
# ensemble.local_test()
|
282 |
+
############################################################ MLOps Model ############################################################
|
283 |
+
class RecommenderSystemModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around the three recommenders.

    predict() input: a two-element sequence [user_id, mode], where mode is
    1=content-based, 2=collaborative, 3=ensemble (collaborative-based),
    4=ensemble (intersection-based). Unknown modes return 0.
    """

    def load_context(self, context):
        # Build the heavyweight recommenders once, at model-load time; the
        # ensemble owns both sub-recommenders, so alias them for direct access.
        self.ensemble = EnsembleRecommendation()
        self.collab = self.ensemble.collab
        self.content = self.ensemble.content

    def predict(self, context, model_input):
        """Standard pyfunc entry point — delegates to the dispatch helper."""
        return self.my_custom_function(model_input)

    def my_custom_function(self, model_input):
        """Dispatch [user_id, mode] to the requested recommender."""
        user_id = model_input[0]
        # Renamed from `type`, which shadowed the builtin of the same name.
        mode = model_input[1]
        if mode == 1:
            return self.content.predict(user_id)
        if mode == 2:
            return self.collab.predict(user_id)
        if mode == 3:
            return self.ensemble.predict(user_id, intersection_base=False)
        if mode == 4:
            return self.ensemble.predict(user_id, intersection_base=True)
        # Fallback for unrecognized modes (kept for backward compatibility).
        return 0
|
305 |
+
|
306 |
+
|
307 |
+
|
308 |
+
# sending request command
|
309 |
+
## mlflow models serve -m recommender-model -p 6000
|
310 |
+
# curl http://127.0.0.1:5000/invocations -H 'Content-Type: application/json' -d '{
|
311 |
+
# "inputs": [1,1]
|
312 |
+
# }'
|
recommender-model/MLmodel
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
flavors:
|
2 |
+
python_function:
|
3 |
+
cloudpickle_version: 2.2.1
|
4 |
+
env:
|
5 |
+
conda: conda.yaml
|
6 |
+
virtualenv: python_env.yaml
|
7 |
+
loader_module: mlflow.pyfunc.model
|
8 |
+
python_model: python_model.pkl
|
9 |
+
python_version: 3.10.6
|
10 |
+
mlflow_version: 2.1.1
|
11 |
+
model_uuid: 781a5cfec6964e7098c03302c14d6f67
|
12 |
+
utc_time_created: '2023-01-20 22:41:17.799682'
|
recommender-model/conda.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
channels:
|
2 |
+
- conda-forge
|
3 |
+
dependencies:
|
4 |
+
- python=3.10.6
|
5 |
+
- pip<=22.3.1
|
6 |
+
- pip:
|
7 |
+
- mlflow<3,>=2.1
|
8 |
+
- astunparse==1.6.3
|
9 |
+
- cffi==1.15.1
|
10 |
+
- cloudpickle==2.2.1
|
11 |
+
- defusedxml==0.7.1
|
12 |
+
- ipython==8.7.0
|
13 |
+
- opt-einsum==3.3.0
|
14 |
+
- psutil==5.9.4
|
15 |
+
- torch==1.13.1
|
16 |
+
name: mlflow-env
|
recommender-model/python_env.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python: 3.10.6
|
2 |
+
build_dependencies:
|
3 |
+
- pip==22.3.1
|
4 |
+
- setuptools==65.7.0
|
5 |
+
- wheel==0.38.4
|
6 |
+
dependencies:
|
7 |
+
- -r requirements.txt
|
recommender-model/python_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9eade417b5d023b0cb4d5039565ec1bf679f564a92903a4d721f2d4cf8d1eebb
|
3 |
+
size 15534
|
recommender-model/requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
mlflow<3,>=2.1
|
2 |
+
astunparse==1.6.3
|
3 |
+
cffi==1.15.1
|
4 |
+
cloudpickle==2.2.1
|
5 |
+
defusedxml==0.7.1
|
6 |
+
ipython==8.7.0
|
7 |
+
opt-einsum==3.3.0
|
8 |
+
psutil==5.9.4
|
9 |
+
torch==1.13.1
|
save-model.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import mlflow
from movie_recommender import RecommenderSystemModel

# Directory the pyfunc model is written to (consumed by load-model.py and
# copied into the Docker image from model_dir/recommender-model).
model_path = "recommender-model"

# Serialize the custom PythonModel to disk as an MLflow pyfunc model.
mlflow.pyfunc.save_model(path=model_path, python_model=RecommenderSystemModel())
|