AmirShabani committed on
Commit
61faab2
1 Parent(s): a684e88

Fully pickled recomms

Browse files
Files changed (3) hide show
  1. core.py +9 -130
  2. requirements.txt +1 -4
  3. user_recomms.pkl +3 -0
core.py CHANGED
@@ -1,17 +1,3 @@
1
- import pip
2
-
3
- def install(package):
4
- if hasattr(pip, 'main'):
5
- pip.main(['install', package])
6
- else:
7
- pip._internal.main(['install', package])
8
-
9
- print("Everything goes bang.")
10
- install('torch_geometric')
11
- install('torch_scatter')
12
- install('torch_sparse')
13
- print("It's havoc baby!")
14
-
15
  import pickle
16
  import numpy as np
17
  import pandas as pd
@@ -19,112 +5,15 @@ import random
19
  from tqdm import tqdm
20
  import matplotlib.pyplot as plt
21
  from sklearn.model_selection import train_test_split
22
- import torch
23
- from torch import nn, optim, Tensor
24
- from torch_sparse import SparseTensor, matmul
25
- from torch_geometric.utils import structured_negative_sampling
26
- from torch_geometric.data import download_url, extract_zip
27
- from torch_geometric.nn.conv.gcn_conv import gcn_norm
28
- from torch_geometric.nn.conv import MessagePassing
29
- from torch_geometric.typing import Adj
30
  from sklearn.neighbors import BallTree
31
- from thefuzz import fuzz
32
- from thefuzz import process
33
-
34
- class LightGCN(MessagePassing):
35
- def __init__(self, num_users, num_items, embedding_dim=64, diffusion_steps=3, add_self_loops=False):
36
- super().__init__()
37
-
38
- # Number of users and items in the graph
39
- self.num_users = num_users
40
- self.num_items = num_items
41
-
42
- # Embedding dimension for user and item nodes
43
- self.embedding_dim = embedding_dim
44
-
45
- # Number of diffusion steps (K) for multi-scale diffusion
46
- self.diffusion_steps = diffusion_steps
47
-
48
- # Whether to add self-loops to the adjacency matrix
49
- self.add_self_loops = add_self_loops
50
-
51
- # Initialize embeddings for users and items (E^0)
52
- self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim) # e_u^0
53
- self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim) # e_i^0
54
-
55
- # Initialize embedding weights with a normal distribution (mean=0, std=0.1)
56
- nn.init.normal_(self.users_emb.weight, std=0.1)
57
- nn.init.normal_(self.items_emb.weight, std=0.1)
58
-
59
- def forward(self, edge_index: SparseTensor):
60
- # Compute the symmetrically normalized adjacency matrix (A_hat or \tilde{A})
61
- edge_index_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)
62
-
63
- # Get initial embeddings E^0 for all nodes (users and items)
64
- emb_0 = torch.cat([self.users_emb.weight, self.items_emb.weight]) # E^0
65
-
66
- # List to store embeddings at each diffusion step (E^1, E^2, ..., E^K)
67
- embs = [emb_0]
68
-
69
- # Initialize the current embeddings to E^0
70
- emb_k = emb_0
71
-
72
- # Perform multi-scale diffusion for K steps
73
- for _ in range(self.diffusion_steps):
74
- # Propagate embeddings and update emb_k using the normalized adjacency matrix
75
- emb_k = self.propagate(edge_index_norm, x=emb_k)
76
- # Save embeddings at each diffusion step for later use
77
- embs.append(emb_k)
78
-
79
- # Stack all the embeddings along the second dimension (stack E^0, E^1, ..., E^K)
80
- embs = torch.stack(embs, dim=1)
81
-
82
- # Calculate the final embeddings by taking the mean of all diffusion embeddings (E^K)
83
- emb_final = torch.mean(embs, dim=1) # E^K
84
-
85
- # Split the final embeddings into user embeddings (e_u^K) and item embeddings (e_i^K)
86
- users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items]) # Splits into e_u^K and e_i^K
87
-
88
- # Returns the final embeddings for users (e_u^K), initial embeddings for users (e_u^0),
89
- # final embeddings for items (e_i^K), and initial embeddings for items (e_i^0)
90
- return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight
91
-
92
- def message(self, x_j: Tensor) -> Tensor:
93
- # The message function is an identity function, i.e., it returns x_j itself
94
- return x_j
95
-
96
- def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
97
- # Perform message passing and aggregation using the normalized adjacency matrix (A_hat or \tilde{A})
98
- return matmul(adj_t, x)
99
-
100
-
101
- model = LightGCN(671, 9125)
102
-
103
- def get_movie_recommendations(user_id, num_recomms):
104
- # Map the user ID to the corresponding index in the model's user embeddings
105
- user_index = user_mapping[user_id]
106
-
107
- # Retrieve the user embedding for the specified user
108
- user_embedding = model.users_emb.weight[user_index]
109
-
110
- # Calculate scores for all items using the user embedding
111
- scores = model.items_emb.weight @ user_embedding
112
-
113
- # Get the indices of the highest scores, including positive items and additional recommendations
114
- values, indices = torch.topk(scores, k=len(user_pos_items[user_id]) + num_recomms)
115
-
116
- # Retrieve the recommended movies that the user has already rated highly
117
- rated_movies = [index.cpu().item() for index in indices if index in user_pos_items[user_id]][:num_recomms]
118
- rated_movie_ids = [list(movie_mapping.keys())[list(movie_mapping.values()).index(movie)] for movie in rated_movies]
119
-
120
- # Retrieve the suggested movies for the user that they have not rated
121
- suggested_movies = [index.cpu().item() for index in indices if index not in user_pos_items[user_id]][:num_recomms]
122
- suggested_movie_ids = [list(movie_mapping.keys())[list(movie_mapping.values()).index(movie)] for movie in suggested_movies]
123
-
124
- return rated_movie_ids, suggested_movie_ids
125
 
126
  addr = './'
127
 
 
 
 
 
 
128
  model.load_state_dict(torch.load(addr + 'model.pth'))
129
 
130
  final_movies_file = open(addr + 'final_movies.pkl', "rb")
@@ -184,23 +73,13 @@ def find_closest_user(user_embedding, tree, user_embeddings):
184
  return closest_user_embedding
185
 
186
 
187
- def drop_non_numerical_columns(df):
188
- non_numerical_columns = df.select_dtypes(exclude=[float, int]).columns
189
- return df.drop(columns=non_numerical_columns, inplace=False)
190
-
191
- def output_list(input_dict, movies_df = movie_embeds, tree = btree, user_embeddings = user_embeds, movies = final_movies):
192
- movie_ratings = {}
193
- for movie_title in input_dict:
194
- matching_title = process.extractOne(movie_title, final_movies['title'].values, scorer=fuzz.partial_token_sort_ratio)[0]
195
- index = movies.index[movies['title'] == matching_title].tolist()[0]
196
- movie_ratings[index] = input_dict[movie_title]
197
  user_embed = create_user_embedding(movie_ratings, movie_embeds)
198
  # Call the find_closest_user function with the pre-built BallTree
199
  closest_user_embed = find_closest_user(user_embed, tree, user_embeds)
200
- rated_movie_ids, suggested_movie_ids = get_movie_recommendations(closest_user_embed['userId'], 5)
201
- out1 = [movie_id for movie_id in set(rated_movie_ids + suggested_movie_ids) if movie_id not in movie_ratings.keys()]
202
- out2 = [movies['title'][idx] for idx in out1]
203
- return out2
204
 
205
  # output_list({1:1,2:2,3:3,4:4,5:5})
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pickle
2
  import numpy as np
3
  import pandas as pd
 
5
  from tqdm import tqdm
6
  import matplotlib.pyplot as plt
7
  from sklearn.model_selection import train_test_split
 
 
 
 
 
 
 
 
8
  from sklearn.neighbors import BallTree
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  addr = './'
11
 
12
+ user_recomms_file = open(addr + 'user_recomms.pkl', "rb")
13
+ user_recomms = pickle.load(user_recomms_file)
14
+ user_recomms_file.close()
15
+
16
+
17
  model.load_state_dict(torch.load(addr + 'model.pth'))
18
 
19
  final_movies_file = open(addr + 'final_movies.pkl', "rb")
 
73
  return closest_user_embedding
74
 
75
 
76
+ def output_list(movie_ratings, movies_df = movie_embeds, tree = btree, user_embeddings = user_embeds, movies = final_movies):
 
 
 
 
 
 
 
 
 
77
  user_embed = create_user_embedding(movie_ratings, movie_embeds)
78
  # Call the find_closest_user function with the pre-built BallTree
79
  closest_user_embed = find_closest_user(user_embed, tree, user_embeds)
80
+ recomms = user_recomms[int(closest_user_embed['userId'])]
81
+ out = [movies['title'].iloc[movie_id] for movie_id in recomms]
82
+ return out
 
83
 
84
  # output_list({1:1,2:2,3:3,4:4,5:5})
85
 
requirements.txt CHANGED
@@ -3,7 +3,4 @@ pillow
3
  numpy==1.23.5
4
  pandas==1.5.3
5
  thefuzz[speedup]
6
- scikit-learn==1.2.2
7
- torch==2.0.0
8
- torchvision==0.15.1
9
- torchaudio==2.0.1
 
3
  numpy==1.23.5
4
  pandas==1.5.3
5
  thefuzz[speedup]
6
+ scikit-learn==1.2.2
 
 
 
user_recomms.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6898b9039369a91e0aa792c09b6bbe8308b5c2d71a297152364bac31699cd60f
3
+ size 20177