Spaces:
Sleeping
Sleeping
| import json | |
| import torch | |
| import nltk | |
| import benepar | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from utils.clean_text import clean_text | |
| from utils.semantic_similarity import Encoder | |
| from utils.syntactic_similarity import Parser | |
| from utils.tfidf_similarity import TFIDF_Vectorizer | |
# Route newly created tensors to the GPU when one is visible, else to the CPU.
torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the NLTK resources and the benepar model before anything that may
# depend on them is constructed below.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)
benepar.download('benepar_en3_large')

# Restaurant table; the functions below read its "embedding",
# "syntactic_tree", "pop_score" and "id" columns.
data = pd.read_csv("data/toy_data_aggregated_embeddings.csv")

# Mapping loaded from restaurant_by_source.json (not referenced by the code
# visible in this part of the file).
with open("data/restaurant_by_source.json", "r") as f:
    restaurant_by_source = json.load(f)

# Precomputed TF-IDF features, handed to the TF-IDF vectorizer at query time.
restaurant_tfidf_features = np.load("data/toy_data_tfidf_features.npz")

# The CSV stores each embedding as a bracketed, whitespace-separated string;
# turn every cell back into a 1-D float array, then stack them row-wise.
data["embedding"] = data["embedding"].apply(
    lambda raw: np.fromstring(raw.strip('[]'), sep=' ')
)
all_desc_embeddings = np.vstack(data["embedding"].to_numpy())

# Query-time components: sentence encoder, syntactic parser, and a TF-IDF
# vectorizer restored from a previously saved state.
encoder = Encoder()
parser = Parser()
tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=True)
def retrieve_candidates(query: str, n_candidates: int):
    """Stage 1 retrieval: score every restaurant against the query.

    Three similarity signals are blended into one score per row of ``data``:
    semantic (cosine similarity between the encoded query and
    ``all_desc_embeddings``), syntactic (mean similarity between the query's
    subtree set and each entry of the row's ``syntactic_tree`` list) and
    TF-IDF (from ``tfidf_vectorizer.compute_tfidf_scores``).

    Args:
        query: Query text. It is used as given; no cleaning happens here.
        n_candidates: Maximum number of candidates to return.

    Returns:
        A NumPy array of row positions ordered by descending combined score,
        holding at most ``n_candidates`` entries. It is empty when
        ``n_candidates`` is zero or negative.
    """
    # A non-positive request yields no candidates. Without this guard the
    # slice `[-0:]` would silently return *every* row.
    if n_candidates <= 0:
        return np.array([], dtype=np.intp)

    semantic_weight, syntactic_weight, tfidf_weight = 0.8, 0.1, 0.1

    # Semantic similarity between the query and every stored embedding.
    query_emb = encoder.encode([query]).cpu().numpy()
    desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]

    # TF-IDF similarity.
    # NOTE(review): assumed to be one score per row, aligned with `data` -- confirm.
    tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(query, restaurant_tfidf_features)

    # Syntactic similarity: compare the query's subtree set with each review.
    parsed_query = parser.subtree_set(parser.parse_text(query))
    # NOTE(review): `data` comes straight from read_csv, so this column is
    # presumably converted to lists of subtree sets elsewhere -- confirm.
    syn_sims = []
    for trees_list in data["syntactic_tree"]:
        review_sims = [
            parser.compute_syntactic_similarity(
                parsed_query,
                set() if review_tree_subs is None else review_tree_subs,
            )
            for review_tree_subs in trees_list
        ]
        # np.mean([]) is NaN, and argsort places NaN last, which would push a
        # restaurant without any trees to the very top once the order is
        # reversed. Score such rows as 0 instead.
        syn_sims.append(np.mean(review_sims) if review_sims else 0.0)
    syn_sims = np.array(syn_sims)

    # Combined Stage 1 score.
    combined_stage1_scores = (
        semantic_weight * desc_sem_sim
        + syntactic_weight * syn_sims
        + tfidf_weight * tfidf_sim
    )

    # Best-first ordering, truncated to the requested number of candidates.
    candidates_idx = np.argsort(combined_stage1_scores)[::-1][:n_candidates]
    return candidates_idx
def rerank(candidates_idx: np.ndarray, n_rec: int = 10, data_source: str = None) -> list:
    """Stage 2 reranking: keep the most popular Stage 1 candidates.

    Args:
        candidates_idx: Index labels of ``data`` selected by Stage 1.
        n_rec: Maximum number of recommendations to return.
        data_source: Accepted for interface compatibility; the current
            implementation does not use it.

    Returns:
        List of ``id`` values of the selected rows, ordered by descending
        ``pop_score``. Empty when ``n_rec`` is zero or negative.
    """
    # A non-positive request yields nothing. Without this guard the slice
    # `[-0:]` would return every candidate instead of none.
    if n_rec <= 0:
        return []

    # Allow any sequence of indices; fancy indexing below needs an ndarray.
    candidates_idx = np.asarray(candidates_idx)

    # Popularity scores of the Stage 1 candidates.
    rerank_scores = data.loc[candidates_idx, "pop_score"].values

    # Positions (within the candidate list) of the n_rec highest scores,
    # best first, mapped back to index labels of `data`.
    topN_reranked_local_idx = np.argsort(rerank_scores)[::-1][:n_rec]
    topN_reranked_global_idx = candidates_idx[topN_reranked_local_idx]

    # Restaurant ids of the final recommendations.
    return data.loc[topN_reranked_global_idx, "id"].tolist()
def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30, data_source: str = None):
    """Run the two-stage recommendation pipeline for a query.

    The query is passed through ``clean_text``, ``retrieve_candidates``
    selects up to ``n_candidates`` rows, and ``rerank`` reduces them to at
    most ``n_rec`` restaurant ids. ``data_source`` is forwarded to ``rerank``
    unchanged.

    Returns:
        The list of restaurant ids produced by ``rerank``.
    """
    return rerank(
        retrieve_candidates(clean_text(query), n_candidates),
        n_rec,
        data_source,
    )