RMakushkin committed
Commit
e926a2d
1 Parent(s): a5457b3

Upload 4 files

data/dataset.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6c10dbf7a899fbf0553bf6cab5fd11abf35cf224e4e6e4f7843fdd19144c550
+ size 19266108
data/embeddings_main.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b33d9e4726eff511c3f0f74dd9d1f22f863828aa0c03ff060c2983be3dce0115
+ size 45892736
data/faiss_index_main.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5fbaa50af8354c8a54372b1c763337f98792c351fa2e3aa266f448ec8266da2
+ size 45892653
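
The three files above are Git LFS pointers; the actual artifacts are fetched on LFS pull. A minimal sketch of how they would presumably be loaded once pulled (paths are from this commit; the use of pandas/numpy/faiss readers is an assumption, not part of the upload):

import numpy as np
import pandas as pd
import faiss  # assumption: faiss-cpu is installed and the .index file was written by FAISS

df = pd.read_csv("data/dataset.csv")                      # film metadata; func.py expects a 'ganres' column
embeddings = np.load("data/embeddings_main.npy")          # precomputed sentence embeddings, one row per film
index = faiss.read_index("data/faiss_index_main.index")   # FAISS index presumably built over the same embeddings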
data/func.py ADDED
@@ -0,0 +1,53 @@
+ import pandas as pd
+ import numpy as np
+ import torch
+ from transformers import BertModel, BertTokenizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
+ model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+
+ def filter_by_ganre(df: pd.DataFrame, ganre_list: list):
+     filtered_df = df[df['ganres'].apply(lambda x: any(g in ganre_list for g in x))]
+     filt_ind = filtered_df.index.to_list()
+     return filt_ind
+
+ # def mean_pooling(model_output, attention_mask):
+ #     token_embeddings = model_output['last_hidden_state']
+ #     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ #     sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+ #     sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+ #     return sum_embeddings / sum_mask
+
+ # def recommendation(filt_ind: list, embeddings: np.array, user_text: str, n=10):
+ #     token_user_text = tokenizer(user_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
+ #     user_embeddings = torch.Tensor().to(device)
+ #     model.to(device)
+ #     model.eval()
+ #     with torch.no_grad():
+ #         batch = {k: v.to(device) for k, v in token_user_text.items()}
+ #         outputs = model(**batch)
+ #         user_embeddings = torch.cat([user_embeddings, mean_pooling(outputs, batch['attention_mask'])])
+ #     user_embeddings = user_embeddings.cpu().numpy()
+ #     cosine_similarities = cosine_similarity(embeddings[filt_ind], user_embeddings.reshape(1, -1))
+ #     df_res = pd.DataFrame(cosine_similarities.ravel(), columns=['cos_sim']).sort_values('cos_sim', ascending=False)
+ #     dict_topn = df_res.iloc[:n, :].cos_sim.to_dict()
+ #     return dict_topn
+
+
+ def embed_user(filt_ind: list, embeddings: np.array, user_text: str, n=10):
+     tokens = tokenizer(user_text, return_tensors="pt", padding=True, truncation=True).to(device)
+     model.to(device)
+     model.eval()
+     with torch.no_grad():
+         outputs = model(**tokens)
+     user_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy().reshape(1, -1)
+     return user_embedding
+
+ # cosine_similarities = cosine_similarity(embeddings[filt_ind], user_embedding.reshape(1, -1))
+ # df_res = pd.DataFrame(cosine_similarities.ravel(), columns=['cos_sim']).sort_values('cos_sim', ascending=False)
+ # dict_topn = df_res.iloc[:n, :].cos_sim.to_dict()
+ # return dict_topn
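
For context, a minimal usage sketch that wires filter_by_ganre and embed_user together the way the commented-out recommendation() did; the module path, genre labels and query string are assumptions, and the LFS-tracked files must be pulled first:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from data.func import filter_by_ganre, embed_user  # assumption: repo root is on sys.path

df = pd.read_csv("data/dataset.csv")
embeddings = np.load("data/embeddings_main.npy")

# Keep only films whose 'ganres' list intersects the requested genres (hypothetical labels)
filt_ind = filter_by_ganre(df, ["drama", "comedy"])

# Encode the free-text query with the same rubert model used for the film embeddings
user_vec = embed_user(filt_ind, embeddings, "a film about space and loneliness")

# Rank the filtered films by cosine similarity and take the top 10, as in recommendation()
sims = cosine_similarity(embeddings[filt_ind], user_vec).ravel()
top10 = pd.Series(sims, index=filt_ind).sort_values(ascending=False).head(10)
print(df.loc[top10.index])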