knguyen471 committed
Commit 888aba6 · verified · 1 Parent(s): a2172cf

Upload 11 files
README.md CHANGED
@@ -1,12 +1,40 @@
  ---
- title: Team 149 Project
- emoji: 📈
- colorFrom: blue
- colorTo: blue
+ title: Team 149 Project 2
+ emoji: 🐠
+ colorFrom: indigo
+ colorTo: red
  sdk: gradio
  sdk_version: 6.0.0
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Restaurant Recommendation System - UI
+
+ A web-based interface for searching and discovering restaurants in Paris, with natural language search, interactive map visualization, and popularity ranking.
+
+ ## Features
+
+ - Natural language search for restaurants
+ - Interactive Paris map with color-coded rating indicators
+ - Bayesian popularity ranking
+ - Semantic and keyword search options
+ - Database of 5,277+ restaurants
+
+ ## Installation
+
+ ### Install Dependencies
+
+ ```bash
+ python -m venv team-149-project
+ source team-149-project/bin/activate  # On Windows: team-149-project\Scripts\activate
+ pip install -r requirements.txt --no-cache-dir
+ ```
+
+ ### Run Application
+
+ ```bash
+ python demo_app_advanced.py
+ ```
+
+ The application launches at `http://127.0.0.1:7860`.
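
For reference, the pipeline can also be called from a script instead of the UI. A minimal sketch, assuming the repo root is the working directory and that main.py's module-level loading (dataset, encoder, parser, TF-IDF vectorizer) succeeds:

```python
# Minimal programmatic usage sketch (assumption: run from the repo root).
from main import get_recommendations

# Retrieve 100 Stage 1 candidates, then rerank down to the top 10
restaurant_ids = get_recommendations("romantic italian dinner", n_candidates=100, n_rec=10)
print(restaurant_ids)  # list of restaurant "id" values, best first
```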
app.py ADDED
@@ -0,0 +1,304 @@
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+ import folium
+ import sys
+ import os
+
+ # Add utils to path
+ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'utils'))
+ from clean_text import clean_text
+ from semantic_similarity import Encoder
+ from ranker import compute_bayesian_popularity_score
+ from main import get_recommendations
+
+ print("Loading restaurant data...")
+ # Path is relative to the repo root (where app.py lives); "../data/..."
+ # would point outside the Space
+ data = pd.read_csv("data/toy_data_aggregated_embeddings.csv")
+ print(f"Loaded {len(data)} restaurants")
+
+ # Compute Bayesian popularity scores
+ print("Computing popularity scores...")
+ data = compute_bayesian_popularity_score(data)
+ print("Popularity scores computed")
+
+ print("Loading pre-computed embeddings...")
+ all_desc_embeddings = np.vstack(data["embedding"].values)
+ print(f"Loaded embeddings with shape {all_desc_embeddings.shape}")
+
+ # Initialize semantic encoder
+ print("Loading semantic encoder model...")
+ try:
+     encoder = Encoder()
+     print("Semantic encoder loaded")
+ except Exception as e:
+     print(f"Warning: Could not load semantic encoder: {e}")
+     print("Falling back to keyword-only search")
+
+ def create_paris_map(results_df):
+     """Create an interactive map of Paris restaurants."""
+     paris_center = [48.8566, 2.3522]
+     m = folium.Map(location=paris_center, zoom_start=12, tiles='OpenStreetMap')
+
+     for idx, row in results_df.iterrows():
+         # The dataset carries no coordinates, so markers are jittered
+         # around the city center purely for visualization
+         lat_offset = np.random.uniform(-0.05, 0.05)
+         lng_offset = np.random.uniform(-0.07, 0.07)
+         coords = [paris_center[0] + lat_offset, paris_center[1] + lng_offset]
+
+         rating = float(row.get('overall_rating', 0))
+         color = 'green' if rating >= 4.5 else 'blue' if rating >= 4.0 else 'orange' if rating >= 3.5 else 'red'
+
+         # Format pop_score defensively: applying :.2f to an 'N/A' fallback
+         # string would raise a ValueError
+         pop_score = row.get('pop_score')
+         pop_text = f"{pop_score:.2f}" if pd.notna(pop_score) else "N/A"
+
+         popup_html = f"""
+         <div style="width:250px">
+             <h4><b>{row['name']}</b></h4>
+             <p>Rating: {row.get('overall_rating', 'N/A')}</p>
+             <p>Reviews: {row.get('review_count', 'N/A')}</p>
+             <p>Popularity Score: {pop_text}</p>
+         </div>
+         """
+
+         folium.Marker(
+             location=coords,
+             popup=folium.Popup(popup_html, max_width=300),
+             icon=folium.Icon(color=color, icon='cutlery', prefix='fa')
+         ).add_to(m)
+
+     return m._repr_html_()
+
+ # def semantic_search(query, data_source, num_results, use_popularity):
+ #     """Semantic search using embeddings"""
+ #     if not query.strip():
+ #         return "Please enter a search query", None
+ #
+ #     try:
+ #         query_clean = clean_text(query)
+ #
+ #         # Generate query embedding
+ #         print(f"Encoding query: {query_clean}")
+ #         query_embedding = encoder.encode([query_clean], show_progress_bar=False)
+ #         query_embedding = query_embedding.cpu().numpy()
+ #
+ #         # Compute semantic similarity
+ #         similarities = cosine_similarity(query_embedding, all_desc_embeddings)[0]
+ #
+ #         # Combine with popularity if requested
+ #         if use_popularity:
+ #             sim_normalized = (similarities - similarities.min()) / (similarities.max() - similarities.min() + 1e-10)
+ #             pop_normalized = (data["pop_score"] - data["pop_score"].min()) / (data["pop_score"].max() - data["pop_score"].min() + 1e-10)
+ #             # Combined score: 70% semantic, 30% popularity
+ #             scores = 0.7 * sim_normalized + 0.3 * pop_normalized
+ #         else:
+ #             scores = similarities
+ #
+ #         top_indices = np.argsort(scores)[-int(num_results):][::-1]
+ #         results = data.iloc[top_indices].copy()
+ #         results['similarity_score'] = scores[top_indices]
+ #
+ #         map_html = create_paris_map(results)
+ #
+ #         output = f"Found {len(results)} restaurants for '{query}'\n"
+ #         output += f"Data Source: {data_source}\n"
+ #         output += f"Search Method: Semantic Search {'+ Popularity' if use_popularity else ''}\n\n"
+ #
+ #         for idx, (_, row) in enumerate(results.iterrows(), 1):
+ #             name = row.get('name', 'Unknown')
+ #             rating = row.get('overall_rating', 'N/A')
+ #             reviews = row.get('review_count', 'N/A')
+ #             similarity = row.get('similarity_score', 0)
+ #             pop_score = row.get('pop_score', 0)
+ #
+ #             output += f"{idx}. **{name}**\n"
+ #             output += f"   Rating: {rating} | Reviews: {reviews}\n"
+ #             output += f"   Match: {similarity:.3f}"
+ #             if use_popularity:
+ #                 output += f" | Popularity: {pop_score:.2f}"
+ #             output += "\n"
+ #
+ #             if 'address' in row and pd.notna(row['address']):
+ #                 addr = str(row['address'])[:100]
+ #                 output += f"   Address: {addr}\n"
+ #
+ #             output += "\n"
+ #
+ #         return output, map_html
+ #
+ #     except Exception as e:
+ #         import traceback
+ #         return f"Error: {str(e)}\n\n{traceback.format_exc()}", None
+
+ # def keyword_search(query, data_source, num_results, use_popularity):
+ #     """Keyword-based search with optional popularity ranking"""
+ #     if not query.strip():
+ #         return "Please enter a search query", None
+ #
+ #     try:
+ #         query_clean = clean_text(query).lower()
+ #         query_words = set(query_clean.split())
+ #
+ #         scores = []
+ #         for idx, row in data.iterrows():
+ #             score = 0
+ #             name = str(row.get('name', '')).lower()
+ #
+ #             # Check name matches
+ #             for word in query_words:
+ #                 if word in name:
+ #                     score += 2
+ #
+ #             rating = float(row.get('overall_rating', 0))
+ #             score += rating * 0.5
+ #
+ #             # Add popularity if requested
+ #             if use_popularity:
+ #                 pop_score = float(row.get('pop_score', 0))
+ #                 score += pop_score * 0.3
+ #
+ #             scores.append(score)
+ #
+ #         top_indices = np.argsort(scores)[-int(num_results):][::-1]
+ #         results = data.iloc[top_indices].copy()
+ #         results['match_score'] = [scores[i] for i in top_indices]
+ #
+ #         map_html = create_paris_map(results)
+ #
+ #         output = f"Found {len(results)} restaurants for '{query}'\n"
+ #         output += f"Data Source: {data_source}\n"
+ #         output += f"Search Method: Keyword Search {'+ Popularity' if use_popularity else ''}\n\n"
+ #
+ #         for idx, (_, row) in enumerate(results.iterrows(), 1):
+ #             name = row.get('name', 'Unknown')
+ #             rating = row.get('overall_rating', 'N/A')
+ #             reviews = row.get('review_count', 'N/A')
+ #             match = row.get('match_score', 0)
+ #             pop_score = row.get('pop_score', 0)
+ #
+ #             output += f"{idx}. **{name}**\n"
+ #             output += f"   Rating: {rating} | Reviews: {reviews}\n"
+ #             output += f"   Match Score: {match:.2f}"
+ #             if use_popularity:
+ #                 output += f" | Popularity: {pop_score:.2f}"
+ #             output += "\n"
+ #
+ #             if 'address' in row and pd.notna(row['address']):
+ #                 addr = str(row['address'])[:100]
+ #                 output += f"   Address: {addr}\n"
+ #
+ #             output += "\n"
+ #
+ #         return output, map_html
+ #
+ #     except Exception as e:
+ #         import traceback
+ #         return f"Error: {str(e)}\n\n{traceback.format_exc()}", None
+
+ # def search_restaurants(query, data_source, search_method, num_results, use_popularity):
+ #     """Main search function that routes to the appropriate search method"""
+ #     if search_method == "Semantic Search" and use_semantic:
+ #         return semantic_search(query, data_source, num_results, use_popularity)
+ #     else:
+ #         return keyword_search(query, data_source, num_results, use_popularity)
+
+ def search_restaurants(query_input, data_source, num_results):
+     n_candidates = 100
+     query_clean = clean_text(query_input)
+     restaurant_ids = get_recommendations(query_clean, n_candidates, int(num_results))
+     # The click handler below wires two outputs (results text and map HTML),
+     # so return a formatted listing and a map rather than the bare id list
+     results = data[data["id"].isin(restaurant_ids)]
+     listing = "\n".join(f"{i}. {r.get('name', 'Unknown')} (rating: {r.get('overall_rating', 'N/A')})"
+                         for i, (_, r) in enumerate(results.iterrows(), 1))
+     return listing, create_paris_map(results)
+
+ # Create Gradio interface
+ with gr.Blocks(title="Restaurant Finder", theme=gr.themes.Soft()) as app:
+     gr.Markdown("""
+     # Advanced Restaurant Recommendation System
+     ### Search Through 5,000+ Restaurants with AI-Powered Semantic Search
+
+     Find restaurants using semantic understanding and popularity ranking!
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             query_input = gr.Textbox(
+                 label="Search Query",
+                 placeholder="e.g., Italian pasta, best sushi, romantic dinner, family-friendly pizza",
+                 lines=2
+             )
+
+         with gr.Column(scale=2):
+             data_source = gr.Dropdown(
+                 choices=["Michelin", "Google", "Yelp"],
+                 value="Yelp",
+                 label="Data Source",
+                 info="Select restaurant data source"
+             )
+
+     with gr.Row():
+         # with gr.Column(scale=2):
+         #     search_method = gr.Radio(
+         #         choices=["Keyword Search", "Semantic Search"],
+         #         value="Semantic Search" if use_semantic else "Keyword Search",
+         #         label="Search Method",
+         #         info="Semantic uses AI embeddings, Keyword uses exact matches"
+         #     )
+
+         with gr.Column(scale=1):
+             num_results = gr.Slider(
+                 minimum=5,
+                 maximum=30,
+                 value=10,
+                 step=5,
+                 label="Results"
+             )
+
+         # with gr.Column(scale=1):
+         #     use_popularity = gr.Checkbox(
+         #         label="Use Popularity Ranking",
+         #         value=True,
+         #         info="Boost popular restaurants"
+         #     )
+
+     search_btn = gr.Button("Search Restaurants", variant="primary", size="lg")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             results_output = gr.Textbox(
+                 label="Restaurant Results",
+                 lines=20,
+                 max_lines=30
+             )
+
+         with gr.Column(scale=1):
+             map_output = gr.HTML(
+                 label="Paris Map"
+             )
+
+     gr.Markdown("### Try These Examples:")
+
+     examples = [
+         ["Italian pasta", "Yelp", 10],
+         ["sushi", "Michelin", 10],
+         ["romantic dinner", "Google", 8],
+         ["family-friendly pizza", "Yelp", 10],
+         ["best seafood", "Michelin", 10],
+         ["cheap burger", "Google", 10]
+     ]
+
+     gr.Examples(
+         examples=examples,
+         inputs=[query_input, data_source, num_results]
+     )
+
+     search_btn.click(
+         fn=search_restaurants,
+         inputs=[query_input, data_source, num_results],
+         outputs=[results_output, map_output]
+     )
+
+     query_input.submit(
+         fn=search_restaurants,
+         inputs=[query_input, data_source, num_results],
+         outputs=[results_output, map_output]
+     )
+
+ if __name__ == "__main__":
+     print("\nStarting Advanced Restaurant Finder...")
+     print(f"{len(data)} restaurants ready to search")
+     print("Popularity Ranking: Enabled")
+     print("Opening at http://127.0.0.1:7860\n")
+
+     app.launch(share=False, server_name="127.0.0.1", server_port=7860, inbrowser=True)
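
To sanity-check the map rendering in isolation, here is a small sketch with a hypothetical two-row DataFrame; the column names match what `create_paris_map` reads, and importing `app` assumes its module-level data and model loading succeeds:

```python
import pandas as pd
from app import create_paris_map  # note: importing app runs its module-level loading

# Hypothetical rows; only the columns create_paris_map reads are populated
toy = pd.DataFrame([
    {"name": "Chez Test", "overall_rating": 4.7, "review_count": 120, "pop_score": 4.1},
    {"name": "Bistro Demo", "overall_rating": 3.2, "review_count": 8, "pop_score": 2.4},
])
html = create_paris_map(toy)  # green marker for the 4.7, red for the 3.2
with open("map_preview.html", "w") as f:
    f.write(html)
```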
data/tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32c417c211041c2ffadda4776cc7eaaa03d416920f5b4541b127fb6c816cc65a
+ size 473929
main.py ADDED
@@ -0,0 +1,94 @@
+ import torch
+ import nltk
+ import benepar
+ import pandas as pd
+ import numpy as np
+ from scipy.sparse import load_npz
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from utils.clean_text import clean_text
+ from utils.semantic_similarity import Encoder
+ from utils.syntactic_similarity import Parser
+ from utils.tfidf_similarity import TFIDF_Vectorizer
+
+ # Set default device to CUDA if available, otherwise CPU
+ if torch.cuda.is_available():
+     torch.set_default_device("cuda")
+ else:
+     torch.set_default_device("cpu")
+
+ # Download models/data
+ nltk.download('punkt')
+ nltk.download('punkt_tab')
+ benepar.download('benepar_en3_large')
+
+ # Load dataset
+ data = pd.read_csv("data/toy_data_aggregated_embeddings.csv")
+
+ # Load precomputed TF-IDF features; the matrix was written with scipy's
+ # save_npz, so read it back with load_npz (np.load would return a raw
+ # NpzFile rather than a sparse matrix)
+ restaurant_tfidf_features = load_npz("data/toy_data_tfidf_features.npz")
+
+ # Extract embeddings
+ all_desc_embeddings = np.vstack(data["embedding"].values)
+
+ # Initialize encoder
+ encoder = Encoder()
+
+ # Initialize syntactic parser
+ parser = Parser()
+
+ # Initialize TF-IDF vectorizer
+ tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=True)
+
+ def retrieve_candidates(query: str, n_candidates: int) -> np.ndarray:
+     # Encode query
+     query_emb = encoder.encode([query]).cpu().numpy()
+
+     # Semantic similarities
+     desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]
+
+     # TF-IDF similarities
+     tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(query, restaurant_tfidf_features)
+
+     # Syntactic similarities
+     parsed_query = parser.parse_text(query)
+     parsed_query = parser.subtree_set(parsed_query)
+
+     # Each "syntactic_tree" entry is expected to hold one subtree set per review
+     syn_sims = []
+     for trees_list in data["syntactic_tree"]:
+         review_sims = []
+         for review_tree_subs in trees_list:
+             if review_tree_subs is None:
+                 review_tree_subs = set()
+             sim = parser.compute_syntactic_similarity(parsed_query, review_tree_subs)
+             review_sims.append(sim)
+         syn_sims.append(np.mean(review_sims))
+
+     # Combined Stage 1 score: 80% semantic, 10% syntactic, 10% TF-IDF
+     syn_sims = np.array(syn_sims)
+     combined_stage1_scores = 0.8 * desc_sem_sim + 0.1 * syn_sims + 0.1 * tfidf_sim
+
+     # Get top N candidates for Stage 2 reranking
+     candidates_idx = np.argsort(combined_stage1_scores)[-n_candidates:][::-1]
+
+     return candidates_idx
+
+ def rerank(candidates_idx: np.ndarray, n_rec: int = 10) -> list:
+     # Get popularity scores for the Stage 1 candidates; candidates_idx holds
+     # positional indices, so use .iloc rather than .loc
+     rerank_scores = data.iloc[candidates_idx]["pop_score"].values
+
+     # Keep the n_rec restaurants with the highest pop_score
+     topN_reranked_local_idx = np.argsort(rerank_scores)[-n_rec:][::-1]
+     topN_reranked_global_idx = candidates_idx[topN_reranked_local_idx]
+
+     # Get restaurant ids for the final recommendations
+     restaurant_ids = data.iloc[topN_reranked_global_idx]["id"].tolist()
+
+     return restaurant_ids
+
+ def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30) -> list:
+     query_clean = clean_text(query)
+     candidates_idx = retrieve_candidates(query_clean, n_candidates)
+     restaurant_ids = rerank(candidates_idx, n_rec)
+     return restaurant_ids
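
The two-stage logic is easiest to see with synthetic numbers. This self-contained sketch (toy scores, not the real pipeline) mirrors `retrieve_candidates`' 0.8/0.1/0.1 blend followed by `rerank`'s popularity sort:

```python
import numpy as np

# Synthetic Stage 1 signals for 5 restaurants (higher is better)
desc_sem_sim = np.array([0.90, 0.40, 0.75, 0.60, 0.85])
syn_sims     = np.array([0.10, 0.05, 0.30, 0.20, 0.15])
tfidf_sim    = np.array([0.50, 0.10, 0.40, 0.70, 0.20])

# Stage 1: weighted blend, 80% semantic / 10% syntactic / 10% TF-IDF
stage1 = 0.8 * desc_sem_sim + 0.1 * syn_sims + 0.1 * tfidf_sim  # [0.78, 0.335, 0.67, 0.57, 0.715]
candidates_idx = np.argsort(stage1)[-3:][::-1]                   # top 3 by blended score: [0, 4, 2]

# Stage 2: rerank the surviving candidates purely by popularity
pop_score = np.array([3.9, 4.8, 4.2, 3.1, 4.5])
order = np.argsort(pop_score[candidates_idx])[-2:][::-1]
print(candidates_idx[order])                                     # [4, 2]
```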
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ # Index options must sit on their own line in a requirements file, so the
+ # CPU wheel index is declared separately from the torch pin
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch>=2.0.0
+
+ numpy==1.25.2
+ scipy==1.11.2
+ pandas==2.1.1
+ scikit-learn==1.3.0
+ sentence-transformers
+ nltk==3.8.1
+ benepar==0.2.0
+ tqdm==4.66.1
+ folium
utils/.DS_Store ADDED
Binary file (6.15 kB).
utils/clean_text.py ADDED
@@ -0,0 +1,28 @@
+ import re
+
+ def clean_text(text: str) -> str:
+     # Strip and lowercase
+     text = text.strip().lower()
+
+     # Remove mentions (@username) and hashtags (#tag)
+     text = re.sub(r'[@#][\w∆]+', '', text)
+
+     # Normalize newlines/tabs and collapse the extra spaces left behind
+     text = text.replace("\n", " ").replace("\t", " ")
+     text = re.sub(r'\s+', ' ', text)
+
+     # Remove phone numbers
+     text = re.sub(r'\b\d{10}\b', '', text)
+
+     # Collapse repeated punctuation (e.g. "!!!!" -> "!")
+     text = re.sub(r'([^\w\s])\1+', r'\1', text)
+
+     # Collapse multiple spaces
+     text = re.sub(r'\s+', ' ', text)
+
+     # Fix escaped apostrophes like: can\'t, don\'t, etc.
+     text = re.sub(r"\\'", "'", text)
+
+     return text.strip()
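
A quick before/after, useful as a smoke test of the rules above:

```python
from utils.clean_text import clean_text

raw = "LOVED it!!!  @foodie #paris call 0123456789 now, can\\'t wait"
print(clean_text(raw))
# -> "loved it! call now, can't wait"
# (lowercased; mention, hashtag, and 10-digit number removed; "!!!" collapsed;
#  escaped apostrophe repaired)
```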
utils/ranker.py ADDED
@@ -0,0 +1,23 @@
+ import numpy as np
+ import pandas as pd
+
+ def compute_bayesian_popularity_score(df, rating_col="overall_rating", reviews_col="review_count", m_prior=20):
+     # Convert to numeric
+     df[rating_col] = pd.to_numeric(df[rating_col], errors="coerce")
+     df[reviews_col] = pd.to_numeric(df[reviews_col], errors="coerce").fillna(0).astype(int)
+
+     # Global mean rating
+     mu = df[rating_col].dropna().mean()
+
+     # Data
+     n = df[reviews_col]
+     r = df[rating_col].fillna(mu)
+
+     # Bayesian rating: shrink each rating toward mu, weighted by review count
+     # against the prior strength m_prior
+     df["bayes_rating"] = ((mu * m_prior + n * r) / (m_prior + n.replace(0, np.nan))).fillna(mu)
+
+     # Popularity metrics
+     df["pop_log"] = np.log1p(n)
+     df["pop_score"] = 0.7 * df["bayes_rating"] + 0.3 * df["pop_log"]
+
+     return df
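
The effect of the prior is easiest to see on a toy DataFrame: a perfect rating backed by 2 reviews is pulled strongly toward the global mean, while one backed by 400 reviews barely moves. The column names below match the function's defaults; the numbers are hypothetical:

```python
import pandas as pd
from utils.ranker import compute_bayesian_popularity_score

df = pd.DataFrame({
    "overall_rating": [5.0, 4.3],   # raw mean ratings
    "review_count":   [2, 400],     # review volumes
})
df = compute_bayesian_popularity_score(df, m_prior=20)
print(df[["bayes_rating", "pop_score"]].round(2))
# mu = (5.0 + 4.3) / 2 = 4.65
# bayes_rating[0] = (4.65*20 + 2*5.0) / 22    ≈ 4.68  (shrunk hard)
# bayes_rating[1] = (4.65*20 + 400*4.3) / 420 ≈ 4.32  (barely moved)
```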
utils/semantic_similarity.py ADDED
@@ -0,0 +1,25 @@
+ from typing import List
+ from sentence_transformers import SentenceTransformer
+
+ class Encoder():
+     def __init__(self):
+         print("Loading embedding model...")
+         self.model = SentenceTransformer(
+             "KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5",
+             model_kwargs={"attn_implementation": "sdpa"}
+         )
+         # Run in half precision to cut memory use
+         self.model = self.model.half()
+
+     def encode(
+         self,
+         texts: List[str],
+         batch_size: int = 8,
+         show_progress_bar: bool = False,
+         save_path: str = None):
+
+         embeddings = self.model.encode(texts, convert_to_tensor=True, show_progress_bar=show_progress_bar, batch_size=batch_size)
+
+         # if save_path:
+         #     torch.save(embeddings, save_path)
+
+         return embeddings
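
Typical usage mirrors main.py: encode, move to CPU, then compare with cosine similarity. Note that constructing `Encoder` downloads the KaLM model on first run:

```python
from sklearn.metrics.pairwise import cosine_similarity
from utils.semantic_similarity import Encoder

encoder = Encoder()  # downloads/loads the embedding model
emb = encoder.encode(["cozy italian trattoria", "sushi counter"]).cpu().numpy()
print(cosine_similarity(emb[:1], emb[1:]))  # similarity between the two texts
```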
utils/syntactic_similarity.py ADDED
@@ -0,0 +1,73 @@
+ import torch
+ import pickle
+ import benepar
+ import nltk
+ from nltk.tree import Tree
+
+ import os
+ from tqdm import tqdm
+ from concurrent.futures import ThreadPoolExecutor
+
+ nltk.data.path.append('data/nltk_data')
+
+ class Parser():
+     def __init__(self):
+         torch.set_default_device("cpu")
+         self.parser = benepar.Parser("benepar_en3_large")
+         self.parser.batch_size = 64
+         self.parsed_eval_reviews_path = "data/parsed/parsed_reviews.pkl"
+         self.parsed_toy_reviews_path = "data/parsed/parsed_toy_data_reviews.pkl"
+
+     def subtree_set(self, tree: Tree):
+         """
+         Return a flat set of all subtrees as strings (hashable).
+         """
+         subs = set()
+
+         def helper(t):
+             # Convert each subtree to a string and add it to the set
+             subs.add(str(t))
+             for child in t:
+                 if isinstance(child, Tree):
+                     helper(child)
+
+         helper(tree)
+         return subs
+
+     def parse_text(self, text):
+         try:
+             return self.parser.parse(text[:10000])  # truncate long reviews
+         except Exception as e:
+             print(f"Parse error: {e}")
+             return None
+
+     def parse_reviews(self, reviews: list, toy_data: bool) -> list[set]:
+         parsed_reviews = []
+         # max(1, ...) guards against a zero worker count on single-core machines
+         with ThreadPoolExecutor(max_workers=max(1, os.cpu_count() - 1)) as executor:
+             for tree in tqdm(executor.map(self.parse_text, reviews), total=len(reviews)):
+                 if isinstance(tree, Tree):
+                     parsed_reviews.append(self.subtree_set(tree))
+                 else:
+                     parsed_reviews.append(set())  # fallback for parse errors
+
+         # Save parsed reviews
+         with open(self.parsed_toy_reviews_path if toy_data else self.parsed_eval_reviews_path, "wb") as f:
+             pickle.dump(parsed_reviews, f)
+
+         return parsed_reviews
+
+     def compute_syntactic_similarity(self, query_tree_subs: set, review_tree_subs: set) -> float:
+         """
+         Jaccard similarity between two sets of subtrees (strings, hashable).
+         """
+         intersect = query_tree_subs.intersection(review_tree_subs)
+         union = query_tree_subs.union(review_tree_subs)
+         if not union:
+             return 0.0
+         return len(intersect) / len(union)
+
+     def load_parsed_reviews(self, toy_data: bool) -> list[set]:
+         path = self.parsed_toy_reviews_path if toy_data else self.parsed_eval_reviews_path
+         with open(path, "rb") as f:
+             parsed_reviews = pickle.load(f)
+         return parsed_reviews
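
The Jaccard step can be exercised without the (large) benepar model by building trees by hand; `Parser.__new__` below deliberately skips `__init__` so no model loads. This is a test-only shortcut for illustration, not how the app uses the class:

```python
from nltk.tree import Tree
from utils.syntactic_similarity import Parser

# Two hand-built constituency trees that differ in one noun
t1 = Tree.fromstring("(S (NP (DT the) (NN pasta)) (VP (VBD was) (JJ great)))")
t2 = Tree.fromstring("(S (NP (DT the) (NN pizza)) (VP (VBD was) (JJ great)))")

parser = Parser.__new__(Parser)  # skip __init__ to avoid loading benepar
s1, s2 = parser.subtree_set(t1), parser.subtree_set(t2)
print(parser.compute_syntactic_similarity(s1, s2))  # 0.4 = 4 shared subtrees / 10 distinct
```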
utils/tfidf_similarity.py ADDED
@@ -0,0 +1,34 @@
+ import pickle
+ from scipy.sparse import save_npz
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ class TFIDF_Vectorizer():
+     def __init__(self, load_vectorizer=None, stop_words='english', min_df=2):
+         # Paths match where this commit stores the fitted artifacts (data/)
+         self.vectorizer_path = "data/tfidf_vectorizer.pkl"
+         self.tfidf_matrix_path = "data/tfidf_matrix.npz"
+
+         if load_vectorizer:
+             with open(self.vectorizer_path, 'rb') as file:
+                 self.vectorizer = pickle.load(file)
+         else:
+             self.vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=min_df)
+
+     def compute_tfidf_matrix(self, texts):
+         features = self.vectorizer.fit_transform(texts)
+
+         # Save the fitted vectorizer
+         with open(self.vectorizer_path, 'wb') as file:
+             pickle.dump(self.vectorizer, file)
+
+         # Save the TF-IDF matrix
+         save_npz(self.tfidf_matrix_path, features)
+         return features
+
+     def transform(self, texts: list):
+         return self.vectorizer.transform(texts)
+
+     def compute_tfidf_scores(self, query: str, restaurant_tfidf_features) -> list:
+         query_tfidf_features = self.vectorizer.transform([query])
+         return cosine_similarity(query_tfidf_features, restaurant_tfidf_features)[0]
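
End-to-end usage on a tiny corpus; `min_df=1` because the corpus is so small, and `compute_tfidf_matrix` writes the fitted vectorizer and matrix to the paths configured above, so the data/ directory must exist:

```python
from utils.tfidf_similarity import TFIDF_Vectorizer

docs = [
    "handmade pasta and classic italian dishes",
    "fresh sushi and sashimi counter",
    "wood-fired pizza with italian wines",
]
vec = TFIDF_Vectorizer(min_df=1)           # fresh vectorizer for a toy corpus
features = vec.compute_tfidf_matrix(docs)  # fits, then saves vectorizer + matrix
print(vec.compute_tfidf_scores("italian pasta", features))
# doc 0 scores highest (both terms), doc 2 overlaps via "italian", doc 1 is ~0
```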