import json

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


class Recommender:
    """Recommend items by cosine similarity of title and content vectors.

    The vectors are read from the table as JSON-encoded strings (see
    ``str2arr``) and compared with scikit-learn's ``cosine_similarity``.
    """

    def __init__(self, id_col, title_col, content_col, title_vec_col,
                 content_vec_col):
        """Store the column selectors used to read the table.

        Args:
            id_col: selector of the id column (stored; not read by the
                methods of this class).
            title_col: selector of the title column; used as
                ``table[title_col]`` when collecting recommended titles.
            content_col: selector of the content column (stored; not read
                by the methods of this class).
            title_vec_col: first-axis index of the JSON-encoded title
                vectors in the transposed table.
            content_vec_col: first-axis index of the JSON-encoded content
                vectors in the transposed table.
        """
        self.title_vec_col = title_vec_col
        self.content_vec_col = content_vec_col
        self.title_col = title_col
        self.content_col = content_col
        self.id_col = id_col

    def calculate_recom_scores(self, k, similarities, exclude=None):
        """Return up to ``k`` ``(index, score)`` pairs, best score first.

        Args:
            k: maximum number of pairs to return.
            similarities: 2-D similarity matrix; only its first row is
                ranked.
            exclude: index of the query item. When given, that index is
                removed from the ranking explicitly. When ``None`` (the
                default, kept for existing callers) the top-ranked entry is
                dropped instead, which assumes the query item ranks first.

        Returns:
            A list of ``(index, score)`` tuples sorted by descending score.
        """
        ranked = sorted(
            enumerate(similarities[0]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        if exclude is None:
            return ranked[1:k + 1]
        # The sort is stable, so an item that ties with the query item and
        # has a smaller index is ranked before it; dropping "rank 0" would
        # then discard a real neighbour and keep the query item itself.
        neighbours = [pair for pair in ranked if pair[0] != exclude]
        return neighbours[:max(k, 0)]

    def str2arr(self, arr):
        """Decode an iterable of JSON strings into one NumPy array.

        Each string is parsed with ``json.loads`` and converted to an
        array; the per-item arrays are then stacked with ``np.array``.
        """
        return np.array([np.array(json.loads(string)) for string in arr])

    def recommend_k(self, table, k, title):
        """Recommend items similar to the one matching ``title``.

        Up to ``k`` neighbours are taken by title-vector similarity and up
        to ``k`` by content-vector similarity. The two lists are merged and,
        when an item occurs in both, only its higher-scoring entry is kept.

        NOTE(review): the layout of ``table`` is not visible from this
        file. The vector columns are read as ``data[col, :]`` from the
        transposed table, the query position is the first-axis index of the
        first cell equal to ``title``, and the returned titles are read via
        ``table[title_col][indices]``. Confirm that these three accesses
        agree for the table type callers actually pass.

        Args:
            table: tabular data; transposed with ``zip(*table)``.
            k: number of neighbours taken from each similarity ranking.
            title: value to look up in the table.

        Returns:
            A tuple ``(indices, union_scores, titles)``: the recommended
            indices, the matching ``(index, score)`` pairs ordered by
            descending score, and the recommended titles joined by
            newlines.

        Raises:
            IndexError: if no cell of the table equals ``title``.
        """
        data = np.array(list(zip(*table)))

        matches = np.where(data == title)[0].tolist()
        if not matches:
            raise IndexError("title {!r} not found in table".format(title))
        idx = matches[0]

        title_vecs = self.str2arr(data[self.title_vec_col, :])
        content_vecs = self.str2arr(data[self.content_vec_col, :])

        titles_sim = cosine_similarity(
            title_vecs[idx].reshape(1, -1), title_vecs)
        contents_sim = cosine_similarity(
            content_vecs[idx].reshape(1, -1), content_vecs)

        # Exclude the query item by index rather than by rank.
        titles_scores = self.calculate_recom_scores(
            k, titles_sim, exclude=idx)
        contents_scores = self.calculate_recom_scores(
            k, contents_sim, exclude=idx)

        merged = sorted(
            set(titles_scores).union(set(contents_scores)),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Keep one entry per item index: the one with the highest score.
        best = {}
        for pair in merged:
            kept = best.get(pair[0])
            if kept is None or pair[1] > kept[1]:
                best[pair[0]] = pair
        union_scores = list(best.values())

        indices = [pair[0] for pair in union_scores]
        titles_text = '\n'.join(table[self.title_col][indices].astype(str))
        return indices, union_scores, titles_text