Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import logging | |
| from pathlib import Path | |
| from tqdm import tqdm | |
# Configure the root logger at INFO level and create this script's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Visualizer")

# Input artifacts, resolved relative to the current working directory.
DATA_DIR = Path("data")
SYNTHETIC_DATA_PATH = DATA_DIR.joinpath("synthetic", "user_sequences.parquet")
CATALOG_PATH = DATA_DIR.joinpath("catalog", "books_catalog.csv")
EMBEDDINGS_PATH = DATA_DIR.joinpath("embeddings_cache.npy")

# Destination of the rendered figure.
OUTPUT_DIR = Path("docs")
OUTPUT_IMAGE = OUTPUT_DIR.joinpath("user_clusters_tsne.png")
def _build_title_index(titles):
    """Map each normalized (lower-cased, stripped) title to its catalog position.

    Non-string entries (e.g. NaN from empty CSV cells) are skipped, but
    positions are still counted over the full list so they keep lining up
    with the catalog rows. If two titles normalize to the same key, the later
    position wins.
    """
    return {
        title.lower().strip(): idx
        for idx, title in enumerate(titles)
        if isinstance(title, str)
    }


def _collect_user_vectors(df_users, title_to_idx, embeddings):
    """Average the embeddings of each user's known books.

    Returns a ``(vectors, personas)`` pair of equal-length lists. Users whose
    history contains no title present in ``title_to_idx`` are dropped.
    """
    user_vectors = []
    user_personas = []
    # Zipping the two columns avoids building a Series per row via iterrows().
    rows = zip(df_users['book_sequence'], df_users['persona'])
    for history, persona in tqdm(rows, total=len(df_users)):
        if history is None:
            continue
        # presumably each history is an iterable of title strings -- verify
        # against the generator of the synthetic data.
        valid_indices = [
            title_to_idx[key]
            for key in (
                book.lower().strip() for book in history if isinstance(book, str)
            )
            if key in title_to_idx
        ]
        if not valid_indices:
            continue
        user_vectors.append(np.mean(embeddings[valid_indices], axis=0))
        user_personas.append(persona)
    return user_vectors, user_personas


def visualize_clusters(sample_size=2000):
    """
    Generates a 2D t-SNE projection of user vectors, colored by Persona.

    Each user vector is the mean of the embeddings of the books in the user's
    history that can be matched (case-insensitively) to a catalog title. The
    scatter plot is written to ``OUTPUT_IMAGE``.

    Args:
        sample_size: Maximum number of users to project. When the dataset is
            larger, a reproducible random sample (seed 42) of this size is
            used.

    Returns:
        None. Problems (missing libraries, missing input files, too few
        usable users) are logged as errors and the function returns early.
    """
    # Plotting dependencies are imported lazily so that a missing optional
    # package produces a helpful message instead of an import-time crash.
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from sklearn.manifold import TSNE
    except ImportError as e:
        logger.error("Missing visualization libraries! (%s)", e)
        logger.error("Please run: uv pip install matplotlib seaborn scikit-learn")
        return

    logger.info("Loading Data...")
    required_files = (CATALOG_PATH, EMBEDDINGS_PATH, SYNTHETIC_DATA_PATH)
    missing = [str(path) for path in required_files if not path.exists()]
    if missing:
        logger.error("Missing Data! Run download scripts first.")
        logger.error("Not found: %s", ", ".join(missing))
        return

    df_catalog = pd.read_csv(CATALOG_PATH)
    title_to_idx = _build_title_index(df_catalog['title'].tolist())
    # assumes row i of the embeddings cache corresponds to catalog row i --
    # TODO confirm against the script that writes the cache.
    embeddings = np.load(EMBEDDINGS_PATH)

    df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
    if len(df_users) > sample_size:
        df_users = df_users.sample(sample_size, random_state=42)

    logger.info("Processing %d users...", len(df_users))
    user_vectors, user_personas = _collect_user_vectors(
        df_users, title_to_idx, embeddings
    )

    valid_users = len(user_vectors)
    if valid_users < 2:
        logger.error(
            "Only %d user(s) with known books; need at least 2 for t-SNE.",
            valid_users,
        )
        return

    X = np.array(user_vectors)

    logger.info("Running t-SNE")
    # scikit-learn requires perplexity < n_samples, so clamp it for small runs.
    perplexity = min(30, valid_users - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    X_embedded = tsne.fit_transform(X)

    logger.info("Generating Plot...")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        x=X_embedded[:, 0],
        y=X_embedded[:, 1],
        hue=user_personas,
        palette="viridis",
        alpha=0.7,
        s=60,
    )
    plt.title(
        f"Semantic User Clusters (t-SNE Projection of {valid_users} Users)",
        fontsize=16,
    )
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Persona")
    plt.tight_layout()
    plt.savefig(OUTPUT_IMAGE, dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

    logger.info("Visualization saved to %s", OUTPUT_IMAGE)
    print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
# Script entry point: render the cluster plot with the default sample size
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    visualize_clusters()