Spaces:
Running
Running
| import logging | |
| import os | |
| import pickle | |
| import sys | |
| import numpy as np | |
| import pandas as pd | |
| # Add project root | |
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) | |
| import src.book_recommender.core.config as config | |
| from src.book_recommender.ml.clustering import cluster_books, get_cluster_names | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") | |
| logger = logging.getLogger(__name__) | |
| def precompute_clusters(): | |
| logger.info("--- Starting Cluster Pre-computation ---") | |
| # 1. Load Data | |
| if not os.path.exists(config.PROCESSED_DATA_PATH) or not os.path.exists(config.EMBEDDINGS_PATH): | |
| logger.error("Data files missing. Run data processor and embedder first.") | |
| return | |
| logger.info(f"Loading book data from {config.PROCESSED_DATA_PATH}...") | |
| book_data_df = pd.read_parquet(config.PROCESSED_DATA_PATH) | |
| logger.info(f"Loading embeddings from {config.EMBEDDINGS_PATH}...") | |
| embeddings_arr = np.load(config.EMBEDDINGS_PATH) | |
| # 2. Cluster | |
| n_clusters = config.NUM_CLUSTERS | |
| logger.info(f"Clustering {len(book_data_df)} books into {n_clusters} clusters...") | |
| clusters_arr, _ = cluster_books(embeddings_arr, n_clusters=n_clusters) | |
| # 3. Name Clusters | |
| book_data_df["cluster_id"] = clusters_arr | |
| names = get_cluster_names(book_data_df, clusters_arr) | |
| # 4. Save Cache | |
| cache_path = config.PROCESSED_DATA_DIR / "cluster_cache.pkl" | |
| logger.info(f"Saving cache to {cache_path}...") | |
| try: | |
| with open(cache_path, "wb") as f: | |
| # Must match the tuple structure expected by api/dependencies.py | |
| # (clusters_arr, names, book_data_df) | |
| pickle.dump((clusters_arr, names, book_data_df), f) | |
| logger.info("Successfully pre-computed and cached clusters.") | |
| except Exception as e: | |
| logger.error(f"Failed to save cache: {e}") | |
| if __name__ == "__main__": | |
| precompute_clusters() | |