# personalisation-engine/scripts/1b_generate_semantic_data.py
# Generates synthetic user reading histories ("semantic user journeys") by
# embedding a book catalog and sampling nearest-neighbour books around
# randomly chosen anchor titles.
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json
import torch
from sentence_transformers import SentenceTransformer
import random
import faiss
NUM_USERS = 10000
MIN_SEQUENCE_LENGTH = 5
MAX_SEQUENCE_LENGTH = 50
DATA_DIR = Path("data")
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
OUTPUT_DIR = DATA_DIR / "synthetic"
MODEL_NAME = "all-MiniLM-L6-v2"
def main():
print("Loading catalog...")
df = pd.read_csv(CATALOG_PATH)
df['rich_content'] = (
"Title: " + df['title'].fillna("") +
"; Author: " + df['authors'].fillna("Unknown") +
"; Genres: " + df['genres'].fillna("") +
"; Description: " + df['description'].fillna("").astype(str).str.slice(0, 300)
)
titles = df['title'].tolist()
content_to_encode = df['rich_content'].tolist()
EMBEDDINGS_CACHE = DATA_DIR / "embeddings_cache.npy"
if EMBEDDINGS_CACHE.exists():
print(f"Loading cached embeddings from {EMBEDDINGS_CACHE}...")
emb_np = np.load(EMBEDDINGS_CACHE)
print("Embeddings loaded.")
else:
print(f"Loading Teacher Model ({MODEL_NAME})...")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(MODEL_NAME, device=device)
print("Encoding books (Title + Author + Genre + Desc)...")
embeddings = model.encode(content_to_encode, show_progress_bar=True, convert_to_tensor=True)
emb_np = embeddings.cpu().numpy()
print(f"Saving embeddings to {EMBEDDINGS_CACHE}...")
np.save(EMBEDDINGS_CACHE, emb_np)
print(f"Generating {NUM_USERS} semantic user journeys...")
cpu_index = faiss.IndexFlatIP(emb_np.shape[1])
faiss.normalize_L2(emb_np)
cpu_index.add(emb_np)
users = []
for user_id in tqdm(range(NUM_USERS)):
sequence = []
num_interests = random.choice([1, 1, 2, 3])
for _ in range(num_interests):
anchor_idx = random.randint(0, len(titles) - 1)
k_neighbors = 50
q = emb_np[anchor_idx].reshape(1, -1)
_, indices = cpu_index.search(q, k_neighbors)
neighbors_indices = indices[0]
num_to_read = random.randint(5, 15)
read_indices = np.random.choice(neighbors_indices, size=min(len(neighbors_indices), num_to_read), replace=False)
for idx in read_indices:
sequence.append(titles[idx])
if len(sequence) > MAX_SEQUENCE_LENGTH:
sequence = sequence[:MAX_SEQUENCE_LENGTH]
if len(sequence) >= MIN_SEQUENCE_LENGTH:
users.append({
'user_id': user_id,
'book_sequence': sequence,
'sequence_length': len(sequence),
'persona': 'semantic_explorer',
'metadata': {'generated': True}
})
users_df = pd.DataFrame(users)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
output_path = OUTPUT_DIR / "user_sequences.parquet"
users_df.to_parquet(output_path, index=False)
stats = {
'num_users': len(users_df),
'avg_sequence_length': float(users_df['sequence_length'].mean()),
'generated_via': "semantic_clustering"
}
with open(OUTPUT_DIR / "user_metadata.json", 'w') as f:
json.dump(stats, f, indent=2)
print(f"\n Generated {len(users_df)} semantic users")
print(f" Output: {output_path}")
if __name__ == "__main__":
main()