| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from tqdm import tqdm | |
| import json | |
| import torch | |
| from sentence_transformers import SentenceTransformer | |
| import random | |
| import faiss | |
# ---- Generation parameters ----
NUM_USERS = 10000         # number of synthetic users to attempt to generate
MIN_SEQUENCE_LENGTH = 5   # users whose final sequence is shorter than this are dropped
MAX_SEQUENCE_LENGTH = 50  # sequences are truncated to this many books

# ---- Paths & teacher model ----
DATA_DIR = Path("data")
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"  # input book catalog (CSV)
OUTPUT_DIR = DATA_DIR / "synthetic"                        # parquet + json outputs land here
MODEL_NAME = "all-MiniLM-L6-v2"                            # sentence-transformers encoder
def _load_catalog() -> pd.DataFrame:
    """Read the book catalog and add a 'rich_content' text column for encoding.

    Missing fields are filled so string concatenation never produces NaN;
    descriptions are clipped to 300 chars to keep encoder inputs bounded.
    """
    print("Loading catalog...")
    df = pd.read_csv(CATALOG_PATH)
    df['rich_content'] = (
        "Title: " + df['title'].fillna("") +
        "; Author: " + df['authors'].fillna("Unknown") +
        "; Genres: " + df['genres'].fillna("") +
        "; Description: " + df['description'].fillna("").astype(str).str.slice(0, 300)
    )
    return df


def _get_embeddings(content_to_encode: list) -> np.ndarray:
    """Return one embedding per catalog entry, caching the result on disk.

    On a cache hit the encoder model is never loaded, which keeps repeat
    runs fast and GPU-free.
    """
    cache_path = DATA_DIR / "embeddings_cache.npy"
    if cache_path.exists():
        print(f"Loading cached embeddings from {cache_path}...")
        emb_np = np.load(cache_path)
        print("Embeddings loaded.")
        return emb_np
    print(f"Loading Teacher Model ({MODEL_NAME})...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(MODEL_NAME, device=device)
    print("Encoding books (Title + Author + Genre + Desc)...")
    embeddings = model.encode(content_to_encode, show_progress_bar=True, convert_to_tensor=True)
    emb_np = embeddings.cpu().numpy()
    print(f"Saving embeddings to {cache_path}...")
    np.save(cache_path, emb_np)
    return emb_np


def _generate_users(titles: list, emb_np: np.ndarray) -> list:
    """Build up to NUM_USERS synthetic reading sequences from FAISS neighbors.

    Each user gets 1-3 "interests"; per interest an anchor book is drawn
    uniformly and 5-15 of its 50 nearest neighbors become the books read.
    Users whose truncated sequence is below MIN_SEQUENCE_LENGTH are skipped,
    so fewer than NUM_USERS entries may be returned.
    """
    print(f"Generating {NUM_USERS} semantic user journeys...")
    # Inner-product search over L2-normalized vectors == cosine similarity.
    cpu_index = faiss.IndexFlatIP(emb_np.shape[1])
    faiss.normalize_L2(emb_np)  # NOTE: normalizes emb_np in place
    cpu_index.add(emb_np)
    users = []
    for user_id in tqdm(range(NUM_USERS)):
        sequence = []
        # Weighted toward single-interest users (1 appears twice).
        num_interests = random.choice([1, 1, 2, 3])
        for _ in range(num_interests):
            anchor_idx = random.randint(0, len(titles) - 1)
            k_neighbors = 50
            query = emb_np[anchor_idx].reshape(1, -1)
            _, indices = cpu_index.search(query, k_neighbors)
            neighbors = indices[0]
            num_to_read = random.randint(5, 15)
            read_indices = np.random.choice(
                neighbors, size=min(len(neighbors), num_to_read), replace=False
            )
            sequence.extend(titles[idx] for idx in read_indices)
        sequence = sequence[:MAX_SEQUENCE_LENGTH]
        if len(sequence) >= MIN_SEQUENCE_LENGTH:
            users.append({
                'user_id': user_id,
                'book_sequence': sequence,
                'sequence_length': len(sequence),
                'persona': 'semantic_explorer',
                'metadata': {'generated': True}
            })
    return users


def _save_outputs(users: list) -> None:
    """Persist user sequences (parquet) and summary statistics (json)."""
    users_df = pd.DataFrame(users)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / "user_sequences.parquet"
    users_df.to_parquet(output_path, index=False)
    stats = {
        'num_users': len(users_df),
        # Guard: if every user was filtered out, the DataFrame has no
        # columns and indexing 'sequence_length' would raise KeyError.
        'avg_sequence_length': (
            float(users_df['sequence_length'].mean()) if len(users_df) else 0.0
        ),
        'generated_via': "semantic_clustering"
    }
    with open(OUTPUT_DIR / "user_metadata.json", 'w') as f:
        json.dump(stats, f, indent=2)
    print(f"\n Generated {len(users_df)} semantic users")
    print(f" Output: {output_path}")


def main():
    """Generate synthetic user reading histories via semantic nearest neighbors.

    Pipeline:
      1. Load the catalog and build a rich text field per book.
      2. Encode the books with a sentence-transformer (disk-cached).
      3. Sample each user's sequence from FAISS nearest neighbors.
      4. Write the sequences (parquet) and run statistics (json).
    """
    df = _load_catalog()
    # Fill missing titles: the raw column can hold NaN (float), and a mixed
    # float/str list column breaks parquet serialization of the sequences.
    titles = df['title'].fillna("").tolist()
    emb_np = _get_embeddings(df['rich_content'].tolist())
    users = _generate_users(titles, emb_np)
    _save_outputs(users)
# Script entry point: run the generation pipeline only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    main()