Spaces:
Sleeping
Sleeping
| """ | |
| Script to generate example/simulated data files for each task type. | |
| Run this script to create sample data files in the data/examples directory. | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| import json | |
| import os | |
# Seed NumPy's global RNG so every run regenerates identical example files.
np.random.seed(42)

# Output directory: <parent of this script's directory>/data/examples.
# abspath() guards against a relative __file__ (e.g. `python script.py` on
# interpreters that do not absolutise it): without it dirname(dirname(...))
# collapses to "" and the files land relative to the current working
# directory instead of next to the project.
OUTPUT_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "data",
    "examples",
)
os.makedirs(OUTPUT_DIR, exist_ok=True)
def generate_classification_data(n_samples=500, positive_rate=0.3, output_dir=None):
    """Generate binary classification example data and write it to CSV.

    Labels are Bernoulli draws with probability ``positive_rate``. Scores are
    drawn from Beta(5, 2) for positives and Beta(2, 5) for negatives, so
    positives tend to receive higher scores. Draws come from NumPy's global
    RNG; seed it beforehand for reproducible output.

    Args:
        n_samples: Number of rows to generate (must be positive).
        positive_rate: Probability that a sample is labelled 1.
        output_dir: Directory for the CSV; defaults to the module-level
            ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        Path of the written ``classification_example.csv``.

    Raises:
        ValueError: If ``n_samples`` is not positive.
    """
    if n_samples <= 0:
        raise ValueError("n_samples must be positive")
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    # Simulate disease presence.
    y_true = np.random.binomial(1, positive_rate, n_samples)

    # NOTE(review): the original comment targeted an AUC of ~0.85, but these
    # two Beta distributions are strongly separated, so the realised AUC is
    # probably higher -- confirm if a specific AUC matters.
    is_positive = y_true == 1
    n_positive = int(np.count_nonzero(is_positive))
    y_pred = np.zeros(n_samples)
    # Positives are drawn first, then negatives; keep this order so a seeded
    # run reproduces the same file.
    y_pred[is_positive] = np.random.beta(5, 2, n_positive)
    y_pred[~is_positive] = np.random.beta(2, 5, n_samples - n_positive)

    df = pd.DataFrame({
        'y_true': y_true,
        'y_pred': y_pred,
    })
    filepath = os.path.join(output_dir, "classification_example.csv")
    df.to_csv(filepath, index=False)

    print(f"Created: {filepath}")
    print(f" - Samples: {n_samples}")
    print(f" - Positive rate: {y_true.mean():.1%}")
    return filepath
def generate_regression_data(n_samples=300, noise_std=3, output_dir=None):
    """Generate regression example data (e.g., tumor size prediction) as CSV.

    True values are log-normal draws clipped to [5, 100]; predictions are the
    true values plus zero-mean Gaussian noise, clipped to [0, 120]. Both
    columns are rounded to two decimals before being written. Draws come from
    NumPy's global RNG; seed it beforehand for reproducible output.

    Args:
        n_samples: Number of rows to generate (must be positive).
        noise_std: Standard deviation of the Gaussian prediction error.
        output_dir: Directory for the CSV; defaults to the module-level
            ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        Path of the written ``regression_example.csv``.

    Raises:
        ValueError: If ``n_samples`` is not positive.
    """
    if n_samples <= 0:
        raise ValueError("n_samples must be positive")
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    # Simulated true tumor sizes (mm), kept inside a 5-100 range.
    y_true = np.clip(
        np.random.lognormal(mean=2.5, sigma=0.5, size=n_samples), 5, 100
    )

    # noise_std is the *standard deviation* of the error, not its mean
    # magnitude (the mean absolute error of a Gaussian is smaller than sigma).
    noise = np.random.normal(0, noise_std, n_samples)
    y_pred = np.clip(y_true + noise, 0, 120)  # Keep predictions reasonable

    df = pd.DataFrame({
        'y_true': np.round(y_true, 2),
        'y_pred': np.round(y_pred, 2),
    })
    filepath = os.path.join(output_dir, "regression_example.csv")
    df.to_csv(filepath, index=False)

    print(f"Created: {filepath}")
    print(f" - Samples: {n_samples}")
    # The reported range uses the unrounded values.
    print(f" - True value range: [{y_true.min():.1f}, {y_true.max():.1f}]")
    return filepath
def generate_segmentation_data_2d(output_dir=None):
    """Generate 2D segmentation example data (e.g., lung nodule segmentation).

    Writes two 256x256 ``uint8`` masks as ``.npy`` files: a ground-truth disc
    (centre row/col 128/128, radius 40) and a prediction disc that is slightly
    offset and smaller (centre row/col 130/132, radius 38) with sparse random
    false-positive pixels added. The noise comes from NumPy's global RNG.

    Args:
        output_dir: Directory for the files; defaults to the module-level
            ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        Tuple ``(gt_path, pred_path)`` of the written files.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    height, width = 256, 256
    rows, cols = np.ogrid[:height, :width]

    def disk(center_row, center_col, radius):
        """Return a 0/1 uint8 mask of pixels within ``radius`` of the centre."""
        inside = (cols - center_col) ** 2 + (rows - center_row) ** 2 <= radius ** 2
        return inside.astype(np.uint8)

    # Ground truth: a circular lesion in the middle of the image.
    y_true = disk(128, 128, 40)

    # Prediction: slight offset and smaller radius, simulating model output.
    y_pred = disk(130, 132, 38)

    # Sprinkle isolated false-positive pixels (each pixel has a 0.1% chance).
    noise_mask = np.random.random((height, width)) < 0.001
    y_pred[noise_mask] = 1

    gt_path = os.path.join(output_dir, "segmentation_2d_ground_truth.npy")
    pred_path = os.path.join(output_dir, "segmentation_2d_prediction.npy")
    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f" - Shape: {y_true.shape}")
    print(f" - GT pixels: {y_true.sum()}, Pred pixels: {y_pred.sum()}")
    return gt_path, pred_path
def generate_segmentation_data_3d(output_dir=None):
    """Generate 3D segmentation example data (e.g., liver segmentation from CT).

    Writes two ``uint8`` volumes of shape (32, 128, 128) as ``.npy`` files: a
    ground-truth ellipsoid and a slightly shifted, slightly smaller predicted
    ellipsoid. The output is fully deterministic (no random draws).

    Args:
        output_dir: Directory for the files; defaults to the module-level
            ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        Tuple ``(gt_path, pred_path)`` of the written files.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    depth, height, width = 32, 128, 128
    zz, yy, xx = np.ogrid[:depth, :height, :width]

    def ellipsoid(center, radii):
        """Return a 0/1 uint8 volume; ``center``/``radii`` are (z, y, x)."""
        cz, cy, cx = center
        rz, ry, rx = radii
        inside = (
            ((xx - cx) / rx) ** 2
            + ((yy - cy) / ry) ** 2
            + ((zz - cz) / rz) ** 2
        ) <= 1
        return inside.astype(np.uint8)

    # Ground truth: an ellipsoid organ centred in the volume.
    y_true = ellipsoid(center=(16, 64, 64), radii=(10, 30, 35))

    # Prediction: one-voxel shifts in y and x, and smaller radii.
    y_pred = ellipsoid(center=(16, 65, 63), radii=(9, 28, 33))

    gt_path = os.path.join(output_dir, "segmentation_3d_ground_truth.npy")
    pred_path = os.path.join(output_dir, "segmentation_3d_prediction.npy")
    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f" - Shape: {y_true.shape}")
    print(f" - GT voxels: {y_true.sum()}, Pred voxels: {y_pred.sum()}")
    return gt_path, pred_path
def generate_multiclass_segmentation_data(output_dir=None):
    """Generate multi-class segmentation data (e.g., brain tumor segmentation).

    Writes two 256x256 ``uint8`` label maps as ``.npy`` files. Labels are
    0 = background, 1 = tumor core, 2 = edema, 3 = enhancing tumor, laid out
    as concentric discs. The prediction uses a shifted centre and slightly
    smaller radii. The output is fully deterministic (no random draws).

    Args:
        output_dir: Directory for the files; defaults to the module-level
            ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        Tuple ``(gt_path, pred_path)`` of the written files.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    height, width = 256, 256
    rows, cols = np.ogrid[:height, :width]

    def paint_lesion(center_row, center_col, layers):
        """Return a label map with ``(label, radius)`` discs painted in order."""
        label_map = np.zeros((height, width), dtype=np.uint8)
        dist_sq = (cols - center_col) ** 2 + (rows - center_row) ** 2
        for label, radius in layers:
            label_map[dist_sq <= radius ** 2] = label
        return label_map

    # Layers are painted outermost-first so each inner region overwrites the
    # one surrounding it: edema (2), then tumor core (1), then enhancing (3).
    y_true = paint_lesion(128, 128, [(2, 50), (1, 30), (3, 15)])

    # Prediction with slightly different boundaries (centre x=130, y=126).
    y_pred = paint_lesion(126, 130, [(2, 48), (1, 28), (3, 14)])

    gt_path = os.path.join(output_dir, "segmentation_multiclass_ground_truth.npy")
    pred_path = os.path.join(output_dir, "segmentation_multiclass_prediction.npy")
    np.save(gt_path, y_true)
    np.save(pred_path, y_pred)

    print(f"Created: {gt_path}")
    print(f"Created: {pred_path}")
    print(f" - Shape: {y_true.shape}")
    print(" - Classes: 0=background, 1=tumor core, 2=edema, 3=enhancing")
    return gt_path, pred_path
def generate_detection_data(n_images=50, output_dir=None):
    """Generate object detection example data (e.g., lung nodule detection).

    For each image a random number of square ground-truth boxes is drawn.
    Each one is "detected" with 80% probability, yielding a jittered
    predicted box with a score in [0.5, 0.98]. Additional false-positive
    boxes with scores in [0.3, 0.7] are then appended. The result is written
    as JSON with ``predictions``, ``ground_truths`` and ``metadata`` keys.
    Draws come from NumPy's global RNG; seed it beforehand for reproducible
    output.

    Args:
        n_images: Number of images to simulate (must not be negative).
        output_dir: Directory for the JSON file; defaults to the module-level
            ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        Path of the written ``detection_example.json``.

    Raises:
        ValueError: If ``n_images`` is negative.
    """
    if n_images < 0:
        raise ValueError("n_images must not be negative")
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    def random_square_box(min_size, max_size):
        """Draw a square ``[x1, y1, x2, y2]`` box with plain-int coordinates."""
        # The draw order (x1, y1, size) is fixed so that a seeded run keeps
        # producing the same file.
        x1 = int(np.random.randint(50, 400))
        y1 = int(np.random.randint(50, 400))
        size = int(np.random.randint(min_size, max_size))
        return [x1, y1, x1 + size, y1 + size]

    predictions = []
    ground_truths = []

    for _ in range(n_images):
        # Number of ground-truth nodules (0-4 per image). Repeated values in
        # the choice list just split that value's probability mass.
        n_nodules = int(np.random.choice(
            [0, 1, 1, 2, 2, 3, 4],
            p=[0.1, 0.25, 0.25, 0.2, 0.1, 0.07, 0.03],
        ))

        img_gt = []
        img_pred = []

        for _ in range(n_nodules):
            gt_box = random_square_box(20, 80)
            img_gt.append(gt_box)

            # 80% chance of detecting this nodule.
            if np.random.random() < 0.8:
                # Localization error: randint's upper bound is exclusive, so
                # each coordinate shifts by an integer in [-8, 7].
                offset = np.random.randint(-8, 8, 4)
                jittered = [int(coord + delta) for coord, delta in zip(gt_box, offset)]
                # Only the top-left corner is clamped at the origin.
                pred_box = [
                    max(0, jittered[0]),
                    max(0, jittered[1]),
                    jittered[2],
                    jittered[3],
                ]
                score = float(np.random.uniform(0.5, 0.98))
                img_pred.append({"box": pred_box, "score": round(score, 3)})

        # False positives (0-2 per image), smaller and with lower confidence.
        n_fp = int(np.random.choice([0, 0, 1, 1, 2], p=[0.4, 0.2, 0.2, 0.15, 0.05]))
        for _ in range(n_fp):
            fp_box = random_square_box(15, 50)
            score = float(np.random.uniform(0.3, 0.7))
            img_pred.append({"box": fp_box, "score": round(score, 3)})

        predictions.append(img_pred)
        ground_truths.append(img_gt)

    data = {
        "predictions": predictions,
        "ground_truths": ground_truths,
        "metadata": {
            "description": "Simulated lung nodule detection data",
            "n_images": n_images,
            "box_format": "[x1, y1, x2, y2]",
        },
    }

    filepath = os.path.join(output_dir, "detection_example.json")
    with open(filepath, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    total_gt = sum(len(gt) for gt in ground_truths)
    total_pred = sum(len(pred) for pred in predictions)
    print(f"Created: {filepath}")
    print(f" - Images: {n_images}")
    print(f" - Total ground truth boxes: {total_gt}")
    print(f" - Total predictions: {total_pred}")
    return filepath
def main():
    """Run every example-data generator in order, printing a banner for each.

    Prints a header, then for each generator a numbered title, a rule, the
    generator's own output and a blank line, and finally a footer that
    reports the output directory.
    """
    banner = "=" * 60
    rule = "-" * 40

    # (section title, generator) pairs, in the order they are reported.
    steps = (
        ("Binary Classification Data", generate_classification_data),
        ("Regression Data", generate_regression_data),
        ("2D Segmentation Data", generate_segmentation_data_2d),
        ("3D Segmentation Data", generate_segmentation_data_3d),
        ("Multi-class Segmentation Data", generate_multiclass_segmentation_data),
        ("Object Detection Data", generate_detection_data),
    )

    print(banner)
    print("Generating example data files for Omnibin")
    print(banner)
    print()

    for number, (title, generate) in enumerate(steps, start=1):
        print(f"{number}. {title}")
        print(rule)
        generate()
        print()

    print(banner)
    print("All example data files generated successfully!")
    print(f"Location: {OUTPUT_DIR}")
    print(banner)


if __name__ == "__main__":
    main()