| """ |
| Data Generation Utilities |
| ======================== |
| |
| Utility functions for generating sample datasets. |
| """ |
|
|
| import pandas as pd |
| import numpy as np |
|
|
def _min_max_normalize(values):
    """Scale ``values`` linearly onto [0, 1].

    Empty input is returned as an empty float array. When every element is
    equal (zero range) the neutral midpoint 0.5 is returned for each element
    instead of dividing by zero.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    low = values.min()
    span = values.max() - low
    if span == 0:
        return np.full(values.shape, 0.5)
    return (values - low) / span


def create_sample_mall_customers(n_customers=200, random_seed=42):
    """
    Create a synthetic sample Mall Customers dataset.

    Gender, age and annual income are drawn independently. The spending
    score is a normally distributed base value shifted by income, age band
    and gender effects plus additional noise, then clipped to 1-100.

    Parameters:
    -----------
    n_customers : int, default=200
        Number of customers to generate. Must be non-negative; 0 yields an
        empty DataFrame with the usual columns.
    random_seed : int, default=42
        Random seed for reproducibility. The seed feeds a private
        ``RandomState``, so NumPy's global random state is left untouched.

    Returns:
    --------
    pd.DataFrame
        Generated customer dataset with columns 'CustomerID', 'Gender',
        'Age', 'Annual Income (k$)' and 'Spending Score (1-100)'.

    Raises:
    -------
    ValueError
        If ``n_customers`` is negative.
    """
    if n_customers < 0:
        raise ValueError("n_customers must be non-negative")

    # A dedicated legacy RandomState yields the same draw sequence as
    # np.random.seed(random_seed) followed by np.random.* calls, but does
    # not reseed the process-wide generator as a side effect.
    rng = np.random.RandomState(random_seed)

    customer_ids = range(1, n_customers + 1)

    # NOTE: the order of the draws below determines the generated values
    # for a given seed; do not reorder them.
    genders = rng.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56])

    # Ages: normal(mean=38.85, std=13.97), truncated to int, limited to 18-70.
    ages = rng.normal(38.85, 13.97, n_customers).astype(int)
    ages = np.clip(ages, 18, 70)

    # Annual income: normal(mean=60.56, std=26.26), limited to 15-137.
    annual_incomes = rng.normal(60.56, 26.26, n_customers)
    annual_incomes = np.clip(annual_incomes, 15, 137)

    base_spending = rng.normal(50, 25, n_customers)

    # Income effect: centred on zero, spanning -15..+15 before the 0.6
    # weight applied in the sum below.
    income_normalized = _min_max_normalize(annual_incomes)
    income_effect = (income_normalized - 0.5) * 30

    # Age effect: +10 for the lowest 30% of the observed age range,
    # -5 for the highest 30%, 0 otherwise.
    age_normalized = _min_max_normalize(ages)
    age_effect = np.where(age_normalized < 0.3, 10,
                          np.where(age_normalized > 0.7, -5, 0))

    gender_effect = np.where(genders == 'Female', 3, -3)

    spending_scores = (base_spending +
                       income_effect * 0.6 +
                       age_effect +
                       gender_effect +
                       rng.normal(0, 10, n_customers))
    spending_scores = np.clip(spending_scores, 1, 100)

    sample_data = pd.DataFrame({
        'CustomerID': customer_ids,
        'Gender': genders,
        'Age': ages,
        'Annual Income (k$)': annual_incomes.round().astype(int),
        'Spending Score (1-100)': spending_scores.round().astype(int),
    })

    return sample_data
|
|