Faaz commited on
Commit Β·
59c6c97
1
Parent(s): 11e0d89
Day 2 COMPLETE: 1.48M examples processed, 6GB dataset, WebSight done
Browse files- scripts/download_datasets.py +891 -0
- scripts/process_data.py +820 -0
scripts/download_datasets.py
ADDED
|
@@ -0,0 +1,891 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MINDI 1.5 Vision-Coder β Day 2 Step 1: Dataset Download Pipeline
|
| 3 |
+
|
| 4 |
+
Downloads 7 datasets (500K+ examples total) with:
|
| 5 |
+
- Rich progress bars
|
| 6 |
+
- Network retry with exponential backoff
|
| 7 |
+
- Checkpoint/resume support
|
| 8 |
+
- Disk space estimation
|
| 9 |
+
- Logging to logs/download.log
|
| 10 |
+
- Running total of examples
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python scripts/download_datasets.py # Download all
|
| 14 |
+
python scripts/download_datasets.py --dataset websight # Download one
|
| 15 |
+
python scripts/download_datasets.py --stage 1 # Stage 1 only (small/fast)
|
| 16 |
+
python scripts/download_datasets.py --stage 2 # Stage 2 (starcoder)
|
| 17 |
+
python scripts/download_datasets.py --stage 3 # Stage 3 (websight)
|
| 18 |
+
python scripts/download_datasets.py --synthetic # Synthetic only
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import argparse
|
| 24 |
+
import hashlib
|
| 25 |
+
import json
|
| 26 |
+
import logging
|
| 27 |
+
import os
|
| 28 |
+
import random
|
| 29 |
+
import sys
|
| 30 |
+
import time
|
| 31 |
+
import traceback
|
| 32 |
+
from dataclasses import dataclass, field
|
| 33 |
+
from pathlib import Path
|
| 34 |
+
from typing import Any, Generator, Optional
|
| 35 |
+
|
| 36 |
+
from rich.console import Console
|
| 37 |
+
from rich.logging import RichHandler
|
| 38 |
+
from rich.panel import Panel
|
| 39 |
+
from rich.progress import (
|
| 40 |
+
BarColumn,
|
| 41 |
+
MofNCompleteColumn,
|
| 42 |
+
Progress,
|
| 43 |
+
SpinnerColumn,
|
| 44 |
+
TextColumn,
|
| 45 |
+
TimeElapsedColumn,
|
| 46 |
+
TimeRemainingColumn,
|
| 47 |
+
)
|
| 48 |
+
from rich.table import Table
|
| 49 |
+
|
| 50 |
+
# ── Project paths ─────────────────────────────────────────────────────
# Resolved relative to this script's location (scripts/ -> repo root).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"
LOGS_DIR = PROJECT_ROOT / "logs"
# Checkpoint lives next to the raw data so it travels with the downloads.
CHECKPOINT_FILE = DATA_RAW / ".download_checkpoint.json"

# Created eagerly at import time so later open()/FileHandler calls cannot
# fail on a missing directory.
DATA_RAW.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# ── Logging ───────────────────────────────────────────────────────────
console = Console()

# Log to both the terminal (rich, colored) and logs/download.log.
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(console=console, rich_tracebacks=True, show_path=False),
        logging.FileHandler(LOGS_DIR / "download.log", encoding="utf-8"),
    ],
)
log = logging.getLogger("mindi.download")
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ── Checkpoint manager ────────────────────────────────────────────────
class CheckpointManager:
    """Tracks which datasets are complete so downloads can resume.

    State is persisted as JSON with two top-level keys:
      - ``"completed"``:   ``{dataset_name: {count, size_mb, timestamp}}``
      - ``"in_progress"``: ``{dataset_name: {count}}``
    """

    def __init__(self, path: Optional[Path] = None) -> None:
        """Open (or initialize) the checkpoint stored at *path*.

        Args:
            path: Checkpoint file location; defaults to the module-level
                ``CHECKPOINT_FILE``. Resolved at call time, not at class
                definition time, so tests can point at a temp file.
        """
        self.path = CHECKPOINT_FILE if path is None else path
        self.data: dict[str, Any] = self._load()

    def _load(self) -> dict[str, Any]:
        """Load checkpoint state, tolerating a missing or corrupt file.

        A truncated checkpoint (e.g. process killed mid-write) must not
        abort the whole run — fall back to a fresh, empty state instead.
        """
        if self.path.exists():
            try:
                raw = json.loads(self.path.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, OSError):
                raw = None  # unreadable/corrupt -> start fresh below
            if isinstance(raw, dict):
                # Guarantee both keys exist even for old/partial files so
                # mark_in_progress()/mark_complete() never KeyError.
                raw.setdefault("completed", {})
                raw.setdefault("in_progress", {})
                return raw
        return {"completed": {}, "in_progress": {}}

    def save(self) -> None:
        """Persist state to disk (small file, single write)."""
        self.path.write_text(json.dumps(self.data, indent=2), encoding="utf-8")

    def is_complete(self, name: str) -> bool:
        """True if dataset *name* has already been fully downloaded."""
        return name in self.data["completed"]

    def mark_complete(self, name: str, count: int, size_mb: float) -> None:
        """Record a finished dataset and clear any in-progress entry."""
        self.data["completed"][name] = {
            "count": count,
            "size_mb": round(size_mb, 2),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        self.data["in_progress"].pop(name, None)
        self.save()

    def mark_in_progress(self, name: str, count: int) -> None:
        """Record partial progress so an interrupted run can resume."""
        self.data["in_progress"][name] = {"count": count}
        self.save()

    def get_resume_count(self, name: str) -> int:
        """Return how many examples of *name* were already saved (0 if none)."""
        return self.data.get("in_progress", {}).get(name, {}).get("count", 0)

    def get_total_examples(self) -> int:
        """Sum of example counts across all completed datasets."""
        return sum(v["count"] for v in self.data["completed"].values())
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# ── Dataset definitions ───────────────────────────────────────────────
@dataclass
class DatasetConfig:
    """Static description of one downloadable (or locally generated) dataset."""

    name: str                 # internal key; also used as the checkpoint key
    hf_name: str              # HuggingFace repo id ("" for synthetic sets)
    hf_subset: Optional[str]  # HF config name, e.g. "python" for starcoderdata
    hf_split: str             # split to stream, normally "train"
    target_count: int         # stop streaming after this many examples
    output_file: str          # JSONL filename written under data/raw/
    stage: int                # 0 = synthetic, 1-3 = download stages by size
    est_size_gb: float        # rough disk-space estimate for pre-flight checks
    description: str          # human-readable summary for tables/logs
    languages: list[str] = field(default_factory=list)  # language tags, if any
    is_synthetic: bool = False  # True -> generated locally, no download
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# Registry of every dataset the pipeline knows about, in stage order.
DATASETS: list[DatasetConfig] = [
    # Stage 1 — Small/fast (5-10 min)
    DatasetConfig(
        name="codealpaca",
        hf_name="sahil2801/CodeAlpaca-20k",
        hf_subset=None,
        hf_split="train",
        target_count=20_000,
        output_file="codealpaca.jsonl",
        stage=1,
        est_size_gb=0.05,
        description="Code instruction-following pairs",
    ),
    DatasetConfig(
        name="codefeedback",
        hf_name="m-a-p/CodeFeedback-Filtered-Instruction",
        hf_subset=None,
        hf_split="train",
        target_count=50_000,
        output_file="codefeedback.jsonl",
        stage=1,
        est_size_gb=0.3,
        description="Code with human feedback",
    ),
    # Stage 2 — Medium (1-2 hours)
    DatasetConfig(
        name="starcoder_python",
        hf_name="bigcode/starcoderdata",
        hf_subset="python",
        hf_split="train",
        target_count=100_000,
        output_file="starcoderdata.jsonl",
        stage=2,
        est_size_gb=2.0,
        description="StarCoder Python code",
        languages=["python"],
    ),
    DatasetConfig(
        name="starcoder_javascript",
        hf_name="bigcode/starcoderdata",
        hf_subset="javascript",
        hf_split="train",
        target_count=100_000,
        output_file="starcoderdata.jsonl",  # appends to same file
        stage=2,
        est_size_gb=2.0,
        description="StarCoder JavaScript code",
        languages=["javascript"],
    ),
    DatasetConfig(
        name="starcoder_typescript",
        hf_name="bigcode/starcoderdata",
        hf_subset="typescript",
        hf_split="train",
        target_count=50_000,
        output_file="starcoderdata.jsonl",  # appends to same file
        stage=2,
        est_size_gb=1.0,
        description="StarCoder TypeScript code",
        languages=["typescript"],
    ),
    # Stage 3 — Large (overnight)
    DatasetConfig(
        name="websight",
        hf_name="HuggingFaceM4/WebSight",
        hf_subset="v0.2",
        hf_split="train",
        target_count=200_000,
        output_file="websight.jsonl",
        stage=3,
        est_size_gb=8.0,
        description="Screenshots + HTML code pairs",
    ),
    # Synthetic — No download needed (generated by the functions below)
    DatasetConfig(
        name="synthetic_nextjs",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=30_000,
        output_file="synthetic_nextjs.jsonl",
        stage=0,
        est_size_gb=0.2,
        description="Synthetic Next.js components with MINDI format",
        is_synthetic=True,
    ),
    DatasetConfig(
        name="search_examples",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=5_000,
        output_file="search_examples.jsonl",
        stage=0,
        est_size_gb=0.03,
        description="MINDI search usage examples",
        is_synthetic=True,
    ),
    DatasetConfig(
        name="sandbox_examples",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=3_000,
        output_file="sandbox_examples.jsonl",
        stage=0,
        est_size_gb=0.02,
        description="MINDI sandbox error-fix examples",
        is_synthetic=True,
    ),
]
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# ── Retry helper ──────────────────────────────────────────────────────
def retry_with_backoff(fn, max_retries: int = 5, base_delay: float = 2.0):
    """Call ``fn()`` and return its result, retrying on any exception.

    The delay before retry *k* (0-based attempt index) is
    ``base_delay * 2**k`` plus up to one second of random jitter, to avoid
    synchronized thundering-herd retries against the same endpoint.

    Args:
        fn: Zero-argument callable to invoke.
        max_retries: Total number of attempts; must be >= 1.
        base_delay: Base of the exponential backoff, in seconds.

    Raises:
        ValueError: If ``max_retries`` < 1 (previously this silently
            returned ``None`` without ever calling ``fn``).
        Exception: Whatever ``fn`` raised on the final attempt.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be >= 1")
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as e:  # broad by design: network failure modes vary
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            logging.getLogger("mindi.download").warning(
                f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s..."
            )
            time.sleep(delay)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
# ── HuggingFace download ──────────────────────────────────────────────
def download_hf_dataset(
    config: DatasetConfig,
    checkpoint: CheckpointManager,
    progress: Progress,
) -> int:
    """Download a HuggingFace dataset with streaming and save as JSONL.

    Streams ``config.hf_name`` (never fully materialized in memory), writes
    each raw example as one JSON line to ``data/raw/<config.output_file>``,
    checkpoints every 5000 examples, and stops at ``config.target_count``.

    Returns:
        Number of examples counted (including any resumed prefix that was
        skipped rather than re-written).
    """
    from datasets import load_dataset  # deferred: heavy import, only needed here

    output_path = DATA_RAW / config.output_file
    resume_count = checkpoint.get_resume_count(config.name)

    # For starcoder subsets that share an output file, use append mode
    # but only if this specific subset hasn't been completed.
    # Net effect of the three branches below: append when sharing a file
    # OR resuming a partial download; otherwise truncate and start over.
    is_append = config.output_file == "starcoderdata.jsonl" and output_path.exists()
    mode = "a" if is_append else "w"
    if not is_append and resume_count == 0:
        mode = "w"
    elif resume_count > 0:
        mode = "a"
        log.info(f"Resuming {config.name} from example {resume_count:,}")

    task = progress.add_task(
        f"[cyan]{config.name}",
        total=config.target_count,
        completed=resume_count,
    )

    log.info(f"Loading {config.hf_name} (subset={config.hf_subset}, split={config.hf_split}) streaming=True")

    def _load():
        # Built as a kwargs dict so the optional subset ("name") can be omitted.
        kwargs = {
            "path": config.hf_name,
            "split": config.hf_split,
            "streaming": True,
            "trust_remote_code": True,
        }
        if config.hf_subset:
            kwargs["name"] = config.hf_subset
        return load_dataset(**kwargs)

    ds = retry_with_backoff(_load)

    count = 0
    skipped = 0
    with open(output_path, mode, encoding="utf-8") as f:
        for example in ds:
            # Resume: fast-forward past examples already on disk.
            # NOTE(review): this re-streams (and discards) the prefix from
            # the network rather than seeking -- acceptable but slow for
            # large resume offsets; confirm that's intended.
            if count < resume_count:
                count += 1
                continue

            # Write raw example as JSONL.
            try:
                # default=str stringifies non-JSON-serializable values
                # (deliberate: raw HF examples may contain dates/images).
                line = json.dumps(example, ensure_ascii=False, default=str)
                f.write(line + "\n")
            except (TypeError, ValueError) as e:
                # Unserializable example: count it as skipped and move on.
                skipped += 1
                continue

            count += 1
            progress.update(task, completed=count)

            # Periodic checkpoint every 5000 examples so an interrupted
            # run can resume near where it stopped.
            if count % 5000 == 0:
                checkpoint.mark_in_progress(config.name, count)
                f.flush()

            if count >= config.target_count:
                break

    size_mb = output_path.stat().st_size / (1024 * 1024)
    # NOTE(review): the leading "β" appears to be mojibake (likely a
    # check-mark emoji) introduced by extraction -- verify against the
    # original file before changing.
    log.info(f"β {config.name}: {count:,} examples, {size_mb:.1f} MB (skipped {skipped})")
    progress.update(task, completed=count)
    return count
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
# ── Synthetic generators ──────────────────────────────────────────────

# Component templates for synthetic Next.js data.
COMPONENT_TYPES = [
    "Navbar", "Hero", "Footer", "Sidebar", "Card", "Modal", "Dropdown",
    "Accordion", "Tabs", "Carousel", "Pagination", "Breadcrumb", "Alert",
    "Toast", "Badge", "Avatar", "Tooltip", "Popover", "Progress", "Spinner",
    "Skeleton", "Table", "Form", "Input", "Select", "Checkbox", "Radio",
    "Switch", "Slider", "DatePicker", "FileUpload", "SearchBar", "CommandPalette",
    "DataTable", "Chart", "Calendar", "Timeline", "Stepper", "Rating",
    "PricingCard", "TestimonialCard", "FeatureGrid", "StatsSection",
    "CTASection", "Newsletter", "LoginForm", "SignupForm", "ProfileCard",
    "DashboardLayout", "SettingsPanel", "NotificationList", "ChatBubble",
]

# Tailwind palette names used to theme generated components.
TAILWIND_COLORS = [
    "slate", "gray", "zinc", "neutral", "stone", "red", "orange", "amber",
    "yellow", "lime", "green", "emerald", "teal", "cyan", "sky", "blue",
    "indigo", "violet", "purple", "fuchsia", "pink", "rose",
]

# Design patterns sprinkled into prompts and critiques for variety.
DESIGN_PATTERNS = [
    "responsive grid layout", "flexbox centering", "gradient background",
    "glassmorphism effect", "dark mode support", "animated entrance",
    "hover transitions", "skeleton loading state", "error boundary",
    "lazy loading", "infinite scroll", "drag and drop", "keyboard navigation",
    "focus management", "scroll animations", "parallax effect",
]

# User-request templates; {component}/{color}/{pattern} filled per example.
USER_REQUESTS = [
    "Build me a {component} component with {pattern}",
    "Create a modern {component} using Tailwind CSS with {color} theme",
    "I need a {component} that supports dark mode and is fully accessible",
    "Design a {component} with smooth animations and {pattern}",
    "Make a responsive {component} component for a SaaS dashboard",
    "Build a {component} with TypeScript and proper prop types",
    "Create a reusable {component} with {pattern} for a landing page",
    "I want a {component} that looks like the latest {color} design trend",
    "Generate a production-ready {component} with {pattern}",
    "Build a {component} component with Framer Motion animations",
]

# Critique templates with {pattern}/{color} placeholders.
# NOTE(review): the "β"/"β οΈ" marks below appear to be mojibake (likely
# check-mark / warning emoji in the original) -- verify before shipping.
CRITIQUE_TEMPLATES = [
    "Visual Analysis:\n- β Layout: Clean {pattern} implementation\n- β Typography: Proper hierarchy with {color} accent colors\n- β οΈ Accessibility: Consider adding aria-labels to interactive elements\n- β Responsiveness: Works across breakpoints",
    "Design Review:\n- β Color scheme: {color} palette creates good visual harmony\n- β Spacing: Consistent padding and margins\n- β οΈ Touch targets: Buttons should be at least 44px for mobile\n- β Visual hierarchy: Clear flow from header to content",
    "UI/UX Assessment:\n- β {pattern}: Well implemented with smooth transitions\n- β Contrast: Text is readable against background\n- β οΈ Loading state: Consider adding skeleton screens\n- β Component structure: Clean separation of concerns",
]

# Fixed improvement-suggestion blurbs (no placeholders).
SUGGEST_TEMPLATES = [
    "Improvements for next iteration:\n1. Add aria-label attributes for screen readers\n2. Implement keyboard navigation (Tab, Enter, Escape)\n3. Add loading skeleton state\n4. Consider adding subtle micro-interactions on hover",
    "Suggestions:\n1. Add error boundary wrapper for production safety\n2. Implement responsive breakpoints for sm/md/lg/xl\n3. Add unit tests with @testing-library/react\n4. Consider extracting reusable hooks for state logic",
    "Next steps:\n1. Add dark mode toggle using next-themes\n2. Optimize images with next/image component\n3. Add Storybook stories for documentation\n4. Implement proper TypeScript discriminated unions for variants",
]
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def _generate_code_block(component: str, color: str) -> str:
    """Generate a realistic Next.js component code block.

    Returns a 'use client' TypeScript component string themed with *color*
    Tailwind classes. The variant is picked with ``random.choice``, so the
    output is nondeterministic unless the caller seeds ``random``.
    """
    props_name = f"{component}Props"
    variants = ["default", "primary", "secondary", "outline", "ghost"]
    variant = random.choice(variants)

    # Doubled braces ({{ }}) are f-string escapes producing literal braces
    # in the emitted TSX source.
    code = f"""'use client';

import {{ useState }} from 'react';
import {{ cn }} from '@/lib/utils';

interface {props_name} {{
  variant?: '{variant}' | 'default';
  className?: string;
  children?: React.ReactNode;
}}

export default function {component}({{ variant = 'default', className, children }}: {props_name}) {{
  const [isActive, setIsActive] = useState(false);

  return (
    <div
      className={{cn(
        'rounded-lg border p-4 transition-all duration-200',
        variant === '{variant}' && 'bg-{color}-50 border-{color}-200 text-{color}-900',
        variant === 'default' && 'bg-white border-gray-200 text-gray-900',
        isActive && 'ring-2 ring-{color}-500 shadow-lg',
        className
      )}}
      onClick={{() => setIsActive(!isActive)}}
      role="button"
      tabIndex={{0}}
      onKeyDown={{(e) => e.key === 'Enter' && setIsActive(!isActive)}}
    >
      <div className="flex items-center justify-between">
        <h3 className="text-lg font-semibold">{component}</h3>
        <span className="text-sm text-{color}-600">{{variant}}</span>
      </div>
      <div className="mt-2 text-sm text-gray-600">
        {{children}}
      </div>
    </div>
  );
}}"""
    return code
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
def generate_synthetic_nextjs(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Generate synthetic Next.js training examples in MINDI format.

    Yields dicts pairing a templated user request with a structured
    assistant response (think / file / code / critique / suggest sections).
    Uses ``random`` throughout; seed it for reproducible output.

    Args:
        count: Number of examples to generate.
        progress: Rich progress instance; one task is added and advanced
            as the consumer drains the generator.
    """
    task = progress.add_task("[magenta]synthetic_nextjs", total=count)

    for i in range(count):
        component = random.choice(COMPONENT_TYPES)
        color = random.choice(TAILWIND_COLORS)
        pattern = random.choice(DESIGN_PATTERNS)

        request_template = random.choice(USER_REQUESTS)
        user_request = request_template.format(
            component=component, color=color, pattern=pattern
        )

        code = _generate_code_block(component, color)
        filename = f"src/components/{component}.tsx"

        thinking = (
            f"The user wants a {component} component. I need to:\n"
            f"1. Create a TypeScript component with proper prop types\n"
            f"2. Use Tailwind CSS with {color} color scheme\n"
            f"3. Implement {pattern}\n"
            f"4. Ensure accessibility with ARIA attributes\n"
            f"5. Add keyboard navigation support"
        )

        critique = random.choice(CRITIQUE_TEMPLATES).format(
            pattern=pattern, color=color
        )
        suggestions = random.choice(SUGGEST_TEMPLATES)

        # FIX: the file header previously hard-coded "path: (unknown)" while
        # the computed `filename` above was never used -- emit the real path
        # so the training example carries a valid file location.
        assistant_content = (
            f"<|think_start|>\n{thinking}\n<|think_end|>\n\n"
            f"<|file_start|>\npath: {filename}\nlanguage: typescript\nframework: next.js 14\n<|file_end|>\n\n"
            f"<|code_start|>\n{code}\n<|code_end|>\n\n"
            f"<|critique_start|>\n{critique}\n<|critique_end|>\n\n"
            f"<|suggest_start|>\n{suggestions}\n<|suggest_end|>"
        )

        yield {
            "id": f"synthetic_{i:06d}",
            "source": "synthetic_nextjs",
            "user": user_request,
            "assistant": assistant_content,
            "component": component,
            "language": "typescript",
            "framework": "nextjs",
        }

        progress.update(task, completed=i + 1)
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
def generate_search_examples(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Generate synthetic search usage examples.

    Cycles deterministically (round-robin, no randomness) through fixed
    (question, query) scenarios and a small package table, yielding
    MINDI-format examples demonstrating the <|search_*|> tool convention.
    """
    task = progress.add_task("[yellow]search_examples", total=count)

    # (user question, search query) pairs cycled round-robin below.
    search_scenarios = [
        ("How to implement dark mode in Next.js 14?", "next.js 14 dark mode implementation next-themes"),
        ("Best practices for React form validation", "react form validation zod react-hook-form 2025"),
        ("How to set up authentication in Next.js?", "next.js 14 authentication NextAuth.js credentials"),
        ("Tailwind CSS animation examples", "tailwind css animation keyframes framer-motion"),
        ("How to optimize images in Next.js?", "next.js image optimization next/image blur placeholder"),
        ("React server components best practices", "react server components RSC data fetching patterns"),
        ("How to deploy Next.js to Vercel?", "next.js 14 vercel deployment environment variables"),
        ("TypeScript utility types for React", "typescript react utility types ComponentProps PropsWithChildren"),
        ("How to use Zustand for state management?", "zustand state management react next.js middleware"),
        ("CSS Grid vs Flexbox for layouts", "css grid flexbox responsive layout patterns 2025"),
        ("How to implement infinite scroll?", "react infinite scroll intersection observer tanstack query"),
        ("Next.js API routes best practices", "next.js 14 route handlers API validation zod"),
        ("How to add SEO to Next.js?", "next.js 14 metadata SEO generateMetadata open graph"),
        ("React testing best practices", "react testing library jest vitest component testing"),
        ("How to use Prisma with Next.js?", "prisma next.js 14 database postgresql schema"),
    ]

    # (package name, one-line blurb, install command) triples.
    packages_db = [
        ("framer-motion", "Production-ready motion library for React", "npm i framer-motion"),
        ("next-themes", "Dark mode for Next.js apps", "npm i next-themes"),
        ("zustand", "Small, fast state management", "npm i zustand"),
        ("@tanstack/react-query", "Powerful data synchronization", "npm i @tanstack/react-query"),
        ("react-hook-form", "Performant forms with validation", "npm i react-hook-form"),
        ("zod", "TypeScript-first schema validation", "npm i zod"),
        ("tailwind-merge", "Merge Tailwind classes without conflicts", "npm i tailwind-merge"),
        ("clsx", "Tiny utility for constructing className strings", "npm i clsx"),
        ("lucide-react", "Beautiful SVG icons for React", "npm i lucide-react"),
        ("@radix-ui/react-dialog", "Accessible dialog component", "npm i @radix-ui/react-dialog"),
    ]

    for i in range(count):
        # Round-robin keeps output deterministic and evenly distributed;
        # the two tables have different lengths, so pairings rotate.
        scenario = search_scenarios[i % len(search_scenarios)]
        pkg = packages_db[i % len(packages_db)]
        user_q = scenario[0]
        search_query = scenario[1]

        # NOTE(review): the lone "β" below appears to be mojibake (likely an
        # em dash in the original) from extraction -- verify before changing.
        assistant_content = (
            f"<|think_start|>\nThe user is asking about {user_q.lower().rstrip('?')}. "
            f"Let me search for the latest best practices.\n<|think_end|>\n\n"
            f"<|search_start|>\nquery: \"{search_query}\"\n"
            f"results: [\n"
            f"  {{\"title\": \"Official Documentation\", \"url\": \"https://docs.example.com\", \"snippet\": \"Comprehensive guide...\"}},\n"
            f"  {{\"title\": \"Best Practices 2025\", \"url\": \"https://blog.example.com\", \"snippet\": \"Updated approach...\"}}\n"
            f"]\n<|search_end|>\n\n"
            f"Based on my research, here's the recommended approach:\n\n"
            f"First, install the required package:\n```bash\n{pkg[2]}\n```\n\n"
            f"**{pkg[0]}** β {pkg[1]}\n\n"
            f"<|code_start|>\n"
            f"// Example usage of {pkg[0]}\n"
            f"import {{ /* relevant imports */ }} from '{pkg[0]}';\n\n"
            f"export default function Example() {{\n"
            f"  // Implementation based on search results\n"
            f"  return <div>Example using {pkg[0]}</div>;\n"
            f"}}\n"
            f"<|code_end|>"
        )

        yield {
            "id": f"search_{i:06d}",
            "source": "search_examples",
            "user": user_q,
            "assistant": assistant_content,
            "search_query": search_query,
        }

        progress.update(task, completed=i + 1)
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def generate_sandbox_examples(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Yield `count` synthetic sandbox error-fix training examples.

    Each example pairs a realistic Next.js/React build error with a full
    think -> error -> failing sandbox -> fix -> passing sandbox transcript
    in MINDI special-token format. The scenario pool is cycled round-robin
    when `count` exceeds its size.
    """
    task = progress.add_task("[red]sandbox_examples", total=count)

    # Pool of realistic error scenarios; reused cyclically.
    scenarios = [
        {
            "error": "TypeError: Cannot read properties of undefined (reading 'map')",
            "cause": "Data array is undefined on initial render before API response",
            "fix": "Add optional chaining and fallback: data?.items?.map(...) ?? []",
            "file": "src/components/DataList.tsx",
        },
        {
            "error": "Error: Hydration failed because the initial UI does not match what was rendered on the server",
            "cause": "Using browser-only APIs (window, localStorage) during server render",
            "fix": "Wrap in useEffect or use dynamic import with ssr: false",
            "file": "src/components/ThemeProvider.tsx",
        },
        {
            "error": "Module not found: Can't resolve '@/components/ui/button'",
            "cause": "Path alias not configured in tsconfig.json",
            "fix": "Add paths mapping in tsconfig.json: '@/*': ['./src/*']",
            "file": "tsconfig.json",
        },
        {
            "error": "Warning: Each child in a list should have a unique 'key' prop",
            "cause": "Missing key prop in .map() iteration",
            "fix": "Add key={item.id} to the mapped JSX element",
            "file": "src/components/ItemList.tsx",
        },
        {
            "error": "TypeError: fetch failed - ECONNREFUSED",
            "cause": "API endpoint is unreachable or CORS is not configured",
            "fix": "Use Next.js API route as proxy, add CORS headers",
            "file": "src/app/api/proxy/route.ts",
        },
        {
            "error": "Error: Invalid hook call. Hooks can only be called inside of the body of a function component",
            "cause": "Calling useState inside a conditional or nested function",
            "fix": "Move hook call to the top level of the component function",
            "file": "src/hooks/useAuth.ts",
        },
        {
            "error": "Build error: Type 'string | undefined' is not assignable to type 'string'",
            "cause": "Environment variable might be undefined at build time",
            "fix": "Add non-null assertion or provide default value with ?? ''",
            "file": "src/lib/config.ts",
        },
        {
            "error": "Warning: validateDOMNesting(...): <div> cannot appear as a descendant of <p>",
            "cause": "Invalid HTML nesting - block element inside inline element",
            "fix": "Change outer <p> to <div> or inner <div> to <span>",
            "file": "src/components/Card.tsx",
        },
        {
            "error": "Error: NEXT_REDIRECT in API route",
            "cause": "Using redirect() in a try/catch block catches the redirect error",
            "fix": "Move redirect() call outside of try/catch, or re-throw NEXT_REDIRECT",
            "file": "src/app/api/auth/route.ts",
        },
        {
            "error": "Unhandled Runtime Error: Maximum update depth exceeded",
            "cause": "useEffect dependency causes infinite re-render loop",
            "fix": "Memoize the dependency with useMemo or useCallback",
            "file": "src/hooks/useData.ts",
        },
    ]

    for idx in range(count):
        scenario = scenarios[idx % len(scenarios)]
        err = scenario["error"]
        cause = scenario["cause"]
        fix = scenario["fix"]
        path = scenario["file"]

        # Assemble the assistant transcript from ordered segments; joined
        # with "" this produces the same text as one long f-string.
        segments = [
            "<|think_start|>\n",
            "I see a build error. Let me analyze:\n",
            f"Error: {err}\n",
            f"Root cause: {cause}\n",
            f"I need to fix this in {path}\n",
            "<|think_end|>\n\n",
            "<|error_start|>\n",
            f"File: {path}\n",
            f"Error: {err}\n",
            "<|error_end|>\n\n",
            "<|sandbox_start|>\n",
            "Running: npm run build\n",
            "Status: FAILED\n",
            "Exit code: 1\n",
            "<|sandbox_end|>\n\n",
            "<|fix_start|>\n",
            f"Root cause: {cause}\n",
            f"Solution: {fix}\n",
            "<|fix_end|>\n\n",
            f"<|file_start|>\npath: {path}\nlanguage: typescript\n<|file_end|>\n\n",
            "<|code_start|>\n",
            f"// Fixed version of {path}\n",
            f"// Applied fix: {fix}\n",
            "export default function Fixed() {\n",
            "  // Corrected implementation\n",
            "  return <div>Fixed component</div>;\n",
            "}\n",
            "<|code_end|>\n\n",
            "<|sandbox_start|>\n",
            "Running: npm run build\n",
            "Status: SUCCESS\n",
            "Exit code: 0\n",
            "<|sandbox_end|>",
        ]

        yield {
            "id": f"sandbox_{idx:06d}",
            "source": "sandbox_examples",
            "user": f"I'm getting this error: {err}",
            "assistant": "".join(segments),
            "error_type": err[:50],
        }

        progress.update(task, completed=idx + 1)
|
| 674 |
+
|
| 675 |
+
|
| 676 |
+
def write_synthetic(
    config: DatasetConfig,
    checkpoint: CheckpointManager,
    progress: Progress,
) -> int:
    """Generate synthetic examples for `config` and write them as JSONL.

    Dispatches on `config.name` to the matching generator and streams one
    JSON object per line to DATA_RAW / config.output_file. Returns the
    number of examples written.

    Note: `checkpoint` is accepted for signature parity with the HF
    download path but is not used inside this function.
    """
    # Map dataset name -> generator function (KeyError on unknown name).
    dispatch = {
        "synthetic_nextjs": generate_synthetic_nextjs,
        "search_examples": generate_search_examples,
        "sandbox_examples": generate_sandbox_examples,
    }
    generator = dispatch[config.name]

    destination = DATA_RAW / config.output_file
    written = 0

    with open(destination, "w", encoding="utf-8") as sink:
        for record in generator(config.target_count, progress):
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
            written += 1

    size_mb = destination.stat().st_size / (1024 * 1024)
    log.info(f"β… {config.name}: {written:,} examples, {size_mb:.1f} MB")
    return written
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
# ββ Disk space check ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 704 |
+
def check_disk_space(datasets: list[DatasetConfig]) -> bool:
    """Verify there is enough free disk space for the planned downloads.

    Prints a size-estimate table and returns False (refusing to proceed)
    when the estimated total exceeds 80% of the free space on the volume
    holding DATA_RAW.
    """
    import shutil

    needed_gb = sum(d.est_size_gb for d in datasets)
    free_gb = shutil.disk_usage(str(DATA_RAW)).free / (1024 ** 3)
    headroom_gb = free_gb - needed_gb

    table = Table(title="πΎ Disk Space Estimate")
    table.add_column("Item", style="cyan")
    table.add_column("Size", justify="right", style="green")

    for d in datasets:
        table.add_row(d.name, f"{d.est_size_gb:.2f} GB")

    # Separator + totals; the "after" row turns red when headroom is tight.
    table.add_row("β" * 20, "β" * 10, style="dim")
    table.add_row("Total estimated", f"{needed_gb:.2f} GB", style="bold")
    table.add_row("Available", f"{free_gb:.1f} GB", style="bold green")
    table.add_row(
        "After download",
        f"~{headroom_gb:.1f} GB",
        style="bold yellow" if headroom_gb > 50 else "bold red",
    )

    console.print(table)

    # Keep a 20% safety margin so the disk is never filled completely.
    if needed_gb > free_gb * 0.8:
        log.error(f"Not enough disk space! Need {needed_gb:.1f} GB, have {free_gb:.1f} GB")
        return False

    return True
|
| 735 |
+
|
| 736 |
+
|
| 737 |
+
# ββ Main pipeline βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 738 |
+
def run_pipeline(
    stage: Optional[int] = None,
    dataset_name: Optional[str] = None,
    synthetic_only: bool = False,
) -> None:
    """Run the download pipeline.

    Filter precedence: a named dataset wins over --synthetic, which wins
    over a stage filter; with no filters, all DATASETS are processed.
    Already-completed datasets (per the checkpoint file) are skipped, so
    the pipeline is safe to rerun after an interruption.
    """
    console.print(Panel.fit(
        "[bold cyan]MINDI 1.5 Vision-Coder β Dataset Download Pipeline[/]\n"
        "[dim]Day 2 Step 1: Download 500K+ training examples[/]",
        border_style="cyan",
    ))

    checkpoint = CheckpointManager()

    # Filter datasets based on args
    if dataset_name:
        targets = [d for d in DATASETS if d.name == dataset_name]
        if not targets:
            log.error(f"Unknown dataset: {dataset_name}. Available: {[d.name for d in DATASETS]}")
            return
    elif synthetic_only:
        targets = [d for d in DATASETS if d.is_synthetic]
    elif stage is not None:
        # stage == 0 is treated as "the synthetic datasets" in addition
        # to any dataset explicitly tagged with that stage number.
        targets = [d for d in DATASETS if d.stage == stage or (stage == 0 and d.is_synthetic)]
    else:
        targets = DATASETS

    # Show plan
    plan_table = Table(title="π Download Plan")
    plan_table.add_column("Dataset", style="cyan")
    plan_table.add_column("Examples", justify="right")
    plan_table.add_column("Est. Size", justify="right")
    plan_table.add_column("Stage")
    plan_table.add_column("Status")

    for d in targets:
        status = "β… Done" if checkpoint.is_complete(d.name) else "β³ Pending"
        stage_label = f"Stage {d.stage}" if d.stage > 0 else "Synthetic"
        plan_table.add_row(
            d.name,
            f"{d.target_count:,}",
            f"{d.est_size_gb:.2f} GB",
            stage_label,
            status,
        )

    console.print(plan_table)

    # Check disk space
    pending = [d for d in targets if not checkpoint.is_complete(d.name)]
    if not pending:
        console.print("\n[bold green]β… All requested datasets already downloaded![/]")
        _print_summary(checkpoint)
        return

    if not check_disk_space(pending):
        # Abort early rather than fail mid-download on a full disk.
        return

    # Download with progress
    console.print()
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
        refresh_per_second=2,
    ) as progress:
        for config in pending:
            # Defensive re-check: `pending` was computed before the loop,
            # but a checkpoint could have been marked since.
            if checkpoint.is_complete(config.name):
                log.info(f"Skipping {config.name} (already complete)")
                continue

            log.info(f"\n{'β' * 50}")
            log.info(f"Starting: {config.name} β {config.description}")

            try:
                # Synthetic sets are generated locally; everything else is
                # streamed from the Hugging Face Hub.
                if config.is_synthetic:
                    count = write_synthetic(config, checkpoint, progress)
                else:
                    count = download_hf_dataset(config, checkpoint, progress)

                size_mb = (DATA_RAW / config.output_file).stat().st_size / (1024 * 1024)
                checkpoint.mark_complete(config.name, count, size_mb)

            except KeyboardInterrupt:
                # Ctrl-C: stop the whole pipeline but keep checkpoint state.
                log.warning(f"\nβ οΈ Interrupted during {config.name}. Progress saved β rerun to resume.")
                return
            except Exception as e:
                # One failing dataset should not abort the rest of the run.
                log.error(f"β Failed {config.name}: {e}")
                log.error(traceback.format_exc())
                continue

    _print_summary(checkpoint)
|
| 834 |
+
|
| 835 |
+
|
| 836 |
+
def _print_summary(checkpoint: CheckpointManager) -> None:
    """Print the final download summary table and the 500K-target status."""
    console.print()
    summary = Table(title="π Download Summary")
    for column, kwargs in (
        ("Dataset", {"style": "cyan"}),
        ("Examples", {"justify": "right"}),
        ("Size", {"justify": "right"}),
        ("Time", {}),
    ):
        summary.add_column(column, **kwargs)

    grand_count = 0
    grand_mb = 0
    # One row per completed dataset recorded in the checkpoint file.
    for name, info in checkpoint.data["completed"].items():
        summary.add_row(
            name,
            f"{info['count']:,}",
            f"{info['size_mb']:.1f} MB",
            info.get("timestamp", ""),
        )
        grand_count += info["count"]
        grand_mb += info["size_mb"]

    summary.add_row("β" * 20, "β" * 10, "β" * 10, "β" * 15, style="dim")
    summary.add_row(
        "[bold]TOTAL[/]",
        f"[bold]{grand_count:,}[/]",
        f"[bold]{grand_mb:.1f} MB[/]",
        "",
        style="bold green",
    )

    console.print(summary)

    if grand_count >= 500_000:
        console.print("\n[bold green]π TARGET REACHED: 500K+ examples downloaded![/]")
    else:
        remaining = 500_000 - grand_count
        console.print(f"\n[yellow]β³ {remaining:,} more examples needed to reach 500K target[/]")
|
| 873 |
+
|
| 874 |
+
|
| 875 |
+
# ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 876 |
+
def main() -> None:
    """CLI entry point: parse arguments and launch the download pipeline."""
    parser = argparse.ArgumentParser(description="MINDI Dataset Download Pipeline")
    parser.add_argument("--dataset", type=str, help="Download a specific dataset by name")
    parser.add_argument("--stage", type=int, choices=[0, 1, 2, 3], help="Download a specific stage")
    parser.add_argument("--synthetic", action="store_true", help="Generate synthetic data only")
    options = parser.parse_args()

    run_pipeline(stage=options.stage, dataset_name=options.dataset, synthetic_only=options.synthetic)
|
| 888 |
+
|
| 889 |
+
|
| 890 |
+
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
scripts/process_data.py
ADDED
|
@@ -0,0 +1,820 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MINDI 1.5 Vision-Coder β Day 2 Step 2: MINDI Format Converter
|
| 3 |
+
|
| 4 |
+
Converts ALL raw datasets (JSONL) into unified MINDI training format.
|
| 5 |
+
|
| 6 |
+
Each output example:
|
| 7 |
+
{
|
| 8 |
+
"id": "mindi_000001",
|
| 9 |
+
"type": "code_generation",
|
| 10 |
+
"source": "websight",
|
| 11 |
+
"messages": [
|
| 12 |
+
{"role": "system", "content": "..."},
|
| 13 |
+
{"role": "user", "content": "..."},
|
| 14 |
+
{"role": "assistant", "content": "<|think_start|>...<|think_end|>..."}
|
| 15 |
+
],
|
| 16 |
+
"metadata": {
|
| 17 |
+
"language": "typescript",
|
| 18 |
+
"framework": "nextjs",
|
| 19 |
+
"has_vision": false,
|
| 20 |
+
"tokens": 1024,
|
| 21 |
+
"quality_score": 8.5
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
Usage:
|
| 26 |
+
python scripts/process_data.py # Process all
|
| 27 |
+
python scripts/process_data.py --source codealpaca # Process one
|
| 28 |
+
python scripts/process_data.py --dry-run # Preview only
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from __future__ import annotations
|
| 32 |
+
|
| 33 |
+
import argparse
|
| 34 |
+
import hashlib
|
| 35 |
+
import json
|
| 36 |
+
import logging
|
| 37 |
+
import random
|
| 38 |
+
import re
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
from dataclasses import dataclass
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import Any, Generator, Optional
|
| 44 |
+
|
| 45 |
+
from rich.console import Console
|
| 46 |
+
from rich.logging import RichHandler
|
| 47 |
+
from rich.panel import Panel
|
| 48 |
+
from rich.progress import (
|
| 49 |
+
BarColumn,
|
| 50 |
+
MofNCompleteColumn,
|
| 51 |
+
Progress,
|
| 52 |
+
SpinnerColumn,
|
| 53 |
+
TextColumn,
|
| 54 |
+
TimeElapsedColumn,
|
| 55 |
+
TimeRemainingColumn,
|
| 56 |
+
)
|
| 57 |
+
from rich.table import Table
|
| 58 |
+
|
| 59 |
+
# ββ Paths βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 60 |
+
# Resolve all project-relative paths from this script's own location so the
# pipeline behaves the same regardless of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"  # input JSONL produced by download_datasets.py
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"  # unified MINDI-format output
LOGS_DIR = PROJECT_ROOT / "logs"
TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

# Output/log directories are created eagerly; DATA_RAW is assumed to already
# exist from the download step -- TODO confirm.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)
|
| 68 |
+
|
| 69 |
+
# ββ Logging βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
+
console = Console()
# Log everything at INFO+ to both the rich console and a persistent file
# under logs/, so runs can be reviewed after the terminal is gone.
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(console=console, rich_tracebacks=True, show_path=False),
        logging.FileHandler(LOGS_DIR / "process_data.log", encoding="utf-8"),
    ],
)
log = logging.getLogger("mindi.process")
|
| 81 |
+
|
| 82 |
+
# ββ System prompt βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 83 |
+
# System prompt injected as the first message of every converted example.
MINDI_SYSTEM_PROMPT = (
    "You are MINDI 1.5 Vision-Coder, an AI built by MINDIGENOUS.AI. "
    "You are an expert in Next.js 14, React, TypeScript, Tailwind CSS, "
    "and UI/UX design. You see your own output and critique it to make "
    "it better for the user."
)
|
| 89 |
+
|
| 90 |
+
# ββ Tokenizer (lazy loaded) ββββββββββββββββββββββββββββββββββββββββββ
|
| 91 |
+
# Module-level tokenizer cache; populated on first call to get_tokenizer().
_tokenizer = None


def get_tokenizer():
    """Lazily load and memoize the MINDI tokenizer from TOKENIZER_PATH."""
    global _tokenizer
    if _tokenizer is None:
        # Imported here so the heavy transformers dependency is only paid
        # when token counting is actually needed.
        from transformers import AutoTokenizer
        _tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_PATH), trust_remote_code=True)
        log.info(f"Loaded tokenizer (vocab={len(_tokenizer):,})")
    return _tokenizer
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def count_tokens(text: str) -> int:
    """Return the number of tokens `text` encodes to (special tokens excluded)."""
    return len(get_tokenizer().encode(text, add_special_tokens=False))
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ββ Language detection ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
+
def detect_language(code: str, filename: str = "") -> str:
    """Detect a programming language from a filename extension or, failing
    that, from lightweight content heuristics.

    The filename extension (when recognized) always wins; content checks
    are ordered from most to least specific, falling back to "unknown".
    """
    suffix_to_lang = {
        ".py": "python", ".js": "javascript", ".jsx": "javascript",
        ".ts": "typescript", ".tsx": "typescript", ".html": "html",
        ".css": "css", ".json": "json", ".md": "markdown",
        ".rs": "rust", ".go": "go", ".java": "java", ".cpp": "cpp",
        ".c": "c", ".rb": "ruby", ".php": "php", ".swift": "swift",
        ".kt": "kotlin", ".sql": "sql", ".sh": "bash",
    }
    if filename:
        lang = suffix_to_lang.get(Path(filename).suffix.lower())
        if lang is not None:
            return lang

    # Heuristic detection from content.
    lowered = code.lower()
    if "import React" in code or "from 'react'" in code or "jsx" in lowered:
        # Type annotations + interface/type keywords suggest TypeScript.
        typed = ": " in code and ("interface " in code or "type " in code)
        return "typescript" if typed else "javascript"
    if "def " in code and "import " in code and ":" in code:
        return "python"
    if all(tok in code for tok in ("func ", "package ")):
        return "go"
    if all(tok in code for tok in ("fn ", "let mut")):
        return "rust"
    if any(tok in code for tok in ("public class", "public static void")):
        return "java"
    if any(tok in code for tok in ("<!DOCTYPE", "<html")):
        return "html"
    if any(tok in code for tok in ("function ", "const ", "=>")):
        return "javascript"
    return "unknown"
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def detect_framework(code: str) -> str:
    """Detect a web framework from code content via ordered substring checks.

    Checks run most-specific first; the first match wins, defaulting to
    "none" when nothing matches.
    """
    lowered = code.lower()
    checks = (
        ("nextjs", lambda: "'use client'" in code or "next/" in code or "Next" in code),
        ("react", lambda: "import React" in code or "from 'react'" in code),
        ("express", lambda: "express" in lowered),
        ("flask", lambda: "from flask" in code or "Flask(" in code),
        ("django", lambda: "from django" in code),
        ("vue", lambda: "import vue" in lowered or "defineComponent" in code),
    )
    for framework, matches in checks:
        if matches():
            return framework
    return "none"
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ββ Quality scoring ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 160 |
+
def score_quality(code: str, language: str) -> float:
    """Score code quality on a 1-10 scale using simple content heuristics.

    Starts at a neutral 5.0, adds small bonuses for signs of structure
    (comments, types, imports, error handling, exports) and subtracts
    penalties for very short or sloppy code. Result is rounded to one
    decimal and clamped to [1.0, 10.0].
    """
    stripped = code.strip()
    line_count = len(stripped.splitlines())
    score = 5.0

    # Length signal: not too short, not just boilerplate.
    if line_count >= 10:
        score += 0.5
    if line_count >= 30:
        score += 0.5
    if line_count < 3:
        score -= 2.0

    # Comments / docstrings present.
    if any(marker in code for marker in ("//", "/*", '"""', "'''", "#")):
        score += 0.5

    # Type annotations (TypeScript/Python only).
    if language in ("typescript", "python") and ":" in code and any(
        tok in code for tok in ("interface ", "type ", "-> ")
    ):
        score += 0.5

    # Proper imports.
    if any(tok in code for tok in ("import ", "from ", "require(")):
        score += 0.3

    # Error handling.
    if any(tok in code for tok in ("try", "catch", "except")):
        score += 0.3

    # Module structure (exports).
    if "export " in code or "module.exports" in code:
        score += 0.3

    # Penalize trivially short snippets.
    if len(stripped) < 50:
        score -= 1.0

    # Penalize obvious low quality.
    if code.count("TODO") > 3 or code.count("FIXME") > 3:
        score -= 0.5
    if code.count("console.log") > 5:
        score -= 0.3

    # Function / class structure present.
    if any(tok in code for tok in ("function ", "class ", "def ", "const ")):
        score += 0.3

    # Tailwind/CSS usage.
    if "className" in code or "tailwind" in code.lower():
        score += 0.3

    return max(1.0, min(10.0, round(score, 1)))
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# ββ Converter: wrap code in MINDI format βββββββββββββββββββββββββββββ
|
| 216 |
+
def wrap_mindi_assistant(
    code: str,
    language: str = "typescript",
    filename: str = "",
    thinking: str = "",
    critique: str = "",
    suggestions: str = "",
) -> str:
    """Wrap code in MINDI special-token format.

    Assembles optional thinking, file-metadata, critique and suggestion
    blocks around the mandatory code block; blocks are joined with blank
    lines, and empty optional sections are omitted entirely.

    Args:
        code: Source code to embed (surrounding whitespace is stripped).
        language: Language label written into the file-metadata block.
        filename: When non-empty, a file-metadata block is emitted with
            this path and the framework detected from `code`.
        thinking: Optional reasoning text for the think block.
        critique: Optional review text for the critique block.
        suggestions: Optional improvement text for the suggest block.

    Returns:
        The assembled multi-block string.
    """
    parts = []

    # Thinking block
    if thinking:
        parts.append(f"<|think_start|>\n{thinking}\n<|think_end|>")

    # File metadata.
    # BUGFIX: previously this emitted the literal "path: (unknown)" even
    # though the caller supplied a filename; emit the actual filename.
    if filename:
        framework = detect_framework(code)
        parts.append(f"<|file_start|>\npath: {filename}\nlanguage: {language}\nframework: {framework}\n<|file_end|>")

    # Code block (always present)
    parts.append(f"<|code_start|>\n{code.strip()}\n<|code_end|>")

    # Critique
    if critique:
        parts.append(f"<|critique_start|>\n{critique}\n<|critique_end|>")

    # Suggestions
    if suggestions:
        parts.append(f"<|suggest_start|>\n{suggestions}\n<|suggest_end|>")

    return "\n\n".join(parts)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def generate_thinking(user_request: str, language: str) -> str:
    """Generate a basic thinking block for a converted example.

    NOTE(review): `user_request` is currently unused by the template --
    the thinking text is generic apart from the language and a random verb.
    """
    verb = random.choice(["analyze", "implement", "create", "design", "build"])
    steps = [
        f"The user wants me to {verb} something. Let me break this down:",
        "1. Understand the requirements from the request",
        f"2. Choose the right approach for {language}",
        "3. Write clean, production-ready code",
        "4. Review for best practices and accessibility",
    ]
    return "\n".join(steps)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def generate_critique(language: str, code: str) -> str:
    """Generate a basic code critique as a bulleted review string.

    Always includes two positive findings and two caveats; JS/TS inputs
    and Tailwind usage add extra positive findings.
    """
    findings = [
        "β… Code structure: Well-organized with clear separation of concerns",
        "β… Naming: Descriptive variable and function names",
    ]
    if language in ("typescript", "javascript"):
        findings.append("β… Modern syntax: Uses ES6+ features appropriately")
    if "className" in code:
        findings.append("β… Styling: Tailwind CSS classes used correctly")
    findings += [
        "β οΈ Consider adding error handling for edge cases",
        "β οΈ Could benefit from unit tests",
    ]
    bullet_lines = [f"- {finding}" for finding in findings]
    return "Code Review:\n" + "\n".join(bullet_lines)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def generate_suggestions() -> str:
    """Generate a numbered list of four randomly sampled improvement tips."""
    pool = [
        "Add comprehensive error handling with try/catch",
        "Implement loading and error states for better UX",
        "Add TypeScript strict mode compliance",
        "Write unit tests with Jest and Testing Library",
        "Add JSDoc comments for public API",
        "Consider extracting reusable hooks",
        "Add proper aria attributes for accessibility",
        "Implement responsive design breakpoints",
        "Add performance optimization with useMemo/useCallback",
        "Consider adding Storybook stories for documentation",
    ]
    selected = random.sample(pool, min(4, len(pool)))
    numbered = [f"{rank}. {tip}" for rank, tip in enumerate(selected, start=1)]
    return "Suggested improvements:\n" + "\n".join(numbered)
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
# ββ Source-specific converters ββββββββββββββββββββββββββββββββββββββββ
|
| 297 |
+
|
| 298 |
+
def convert_codealpaca(raw: dict, idx: int) -> Optional[dict]:
    """Convert a CodeAlpaca record into the MINDI chat format.

    Returns None when the record lacks an instruction or an output.
    """
    task_text = raw.get("instruction", "").strip()
    extra_input = raw.get("input", "").strip()
    answer_code = raw.get("output", "").strip()

    # An instruction and an output are both mandatory.
    if not (task_text and answer_code):
        return None

    if extra_input:
        user_content = f"{task_text}\n{extra_input}".strip()
    else:
        user_content = task_text

    language = detect_language(answer_code)
    quality = score_quality(answer_code, language)

    assistant_content = wrap_mindi_assistant(
        code=answer_code,
        language=language,
        thinking=generate_thinking(task_text, language),
        critique=generate_critique(language, answer_code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "codealpaca",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(answer_code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }
|
| 339 |
+
|
| 340 |
+
def convert_codefeedback(raw: dict, idx: int) -> Optional[dict]:
    """Convert a CodeFeedback record into the MINDI chat format.

    Returns None when either the query or the answer is missing.
    """
    query = raw.get("query", "").strip()
    answer = raw.get("answer", "").strip()
    if not (query and answer):
        return None

    # Prefer the contents of fenced code blocks; otherwise keep the whole answer.
    fenced = re.findall(r"```[\w]*\n(.*?)```", answer, re.DOTALL)
    code = "\n\n".join(fenced) if fenced else answer

    language = detect_language(code)
    quality = score_quality(code, language)

    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(query, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "codefeedback",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": query},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }
|
| 383 |
+
|
| 384 |
+
def convert_starcoderdata(raw: dict, idx: int) -> Optional[dict]:
    """Convert raw StarCoder source code into a MINDI instruction example.

    A synthetic user request is derived from the first recognizable
    definition (def/class/function/const/export) in the code, since raw
    corpus files carry no instruction of their own.

    Returns None for empty or trivially short (< 50 chars) snippets.

    Fix vs. previous version: removed the dead locals ``max_lines`` /
    ``avg_line`` (read from raw metadata but never used).
    """
    content = raw.get("content", "").strip()
    if not content or len(content) < 50:
        return None

    language = detect_language(content)
    quality = score_quality(content, language)

    # Create a synthetic user request from the first definition we can spot
    # in the opening 500 characters of the snippet.
    first_lines = content[:500]
    if "def " in first_lines:
        match = re.search(r"def (\w+)", first_lines)
        func_name = match.group(1) if match else "function"
        user_request = f"Write a {language} function called `{func_name}` with proper implementation"
    elif "class " in first_lines:
        match = re.search(r"class (\w+)", first_lines)
        class_name = match.group(1) if match else "Class"
        user_request = f"Create a {language} class called `{class_name}` with full implementation"
    elif "function " in first_lines or "const " in first_lines:
        match = re.search(r"(?:function|const)\s+(\w+)", first_lines)
        name = match.group(1) if match else "component"
        user_request = f"Implement `{name}` in {language} with clean, modern code"
    elif "export " in first_lines:
        match = re.search(r"export\s+(?:default\s+)?(?:function|class|const)\s+(\w+)", first_lines)
        name = match.group(1) if match else "module"
        user_request = f"Build an exported {language} module `{name}`"
    else:
        user_request = f"Write this {language} code with best practices"

    # Pick a conventional filename for the detected language ("" otherwise).
    filename = {
        "python": "main.py",
        "typescript": "index.tsx",
        "javascript": "index.js",
    }.get(language, "")

    assistant_content = wrap_mindi_assistant(
        code=content,
        language=language,
        filename=filename,
        thinking=generate_thinking(user_request, language),
        critique=generate_critique(language, content),
        suggestions=generate_suggestions(),
    )

    tokens = count_tokens(assistant_content)

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "starcoderdata",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(content),
            "has_vision": False,
            "tokens": tokens,
            "quality_score": quality,
        },
    }
|
| 457 |
+
|
| 458 |
+
def convert_websight(raw: dict, idx: int) -> Optional[dict]:
    """Convert a WebSight HTML+screenshot record into the MINDI format.

    The HTML is kept verbatim as the code payload — converting it to a
    Next.js/TSX component is the training objective, not a preprocessing
    step. Returns None when the record carries no HTML text.
    """
    html = raw.get("text", "").strip()
    if not html:
        return None

    # Quality is scored against the raw HTML itself.
    quality = score_quality(html, "html")
    has_image = "image" in raw or "screenshot" in raw

    user_request = "Convert this webpage design into a modern Next.js 14 component with Tailwind CSS"

    thinking = (
        "The user wants me to convert a web design to Next.js. I need to:\n"
        "1. Analyze the HTML structure and visual layout\n"
        "2. Convert HTML elements to React JSX syntax\n"
        "3. Replace CSS classes with Tailwind CSS utilities\n"
        "4. Add TypeScript types and proper component structure\n"
        "5. Ensure responsive design and accessibility"
    )

    assistant_content = wrap_mindi_assistant(
        code=html,
        language="typescript",
        filename="src/components/ConvertedPage.tsx",
        thinking=thinking,
        critique=generate_critique("typescript", html),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "vision_code",
        "source": "websight",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": "typescript",
            "framework": "nextjs",
            "has_vision": has_image,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }
|
| 509 |
+
|
| 510 |
+
def convert_synthetic(raw: dict, idx: int) -> Optional[dict]:
    """Finalize a synthetic example that is already close to MINDI format.

    Returns None when either side of the conversation is missing.
    """
    user_content = raw.get("user", "").strip()
    assistant_content = raw.get("assistant", "").strip()
    source = raw.get("source", "synthetic")

    if not (user_content and assistant_content):
        return None

    language = raw.get("language", "typescript")
    # Search-style sources become "search" examples; all others are codegen.
    example_type = "search" if "search" in source else "code_generation"

    return {
        "id": f"mindi_{idx:06d}",
        "type": example_type,
        "source": source,
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": raw.get("framework", "nextjs"),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(assistant_content, language),
        },
    }
|
| 540 |
+
|
| 541 |
+
def convert_evol_code(raw: dict, idx: int) -> Optional[dict]:
    """Convert an EvolInstruct-Code record into the MINDI chat format.

    Returns None when the instruction or output field is empty.
    """
    instruction = raw.get("instruction", "").strip()
    output = raw.get("output", "").strip()
    if not (instruction and output):
        return None

    # Extract fenced code blocks if present; otherwise treat the whole
    # answer as the code payload.
    fences = re.findall(r"```[\w]*\n(.*?)```", output, re.DOTALL)
    code = "\n\n".join(fences) if fences else output

    language = detect_language(code)
    quality = score_quality(code, language)

    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(instruction, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "evol_code",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }
|
| 583 |
+
|
| 584 |
+
def convert_magicoder(raw: dict, idx: int) -> Optional[dict]:
    """Convert a Magicoder record into the MINDI chat format.

    Magicoder records use either instruction/response or problem/solution
    field pairs; both spellings are accepted. Returns None when either
    side is missing.
    """
    instruction = (raw.get("instruction", "") or raw.get("problem", "")).strip()
    output = (raw.get("response", "") or raw.get("solution", "")).strip()
    if not (instruction and output):
        return None

    # Pull out fenced code blocks when present; fall back to the raw answer.
    fences = re.findall(r"```[\w]*\n(.*?)```", output, re.DOTALL)
    code = "\n\n".join(fences) if fences else output

    language = detect_language(code)
    quality = score_quality(code, language)

    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(instruction, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "magicoder",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }
+
|
| 627 |
+
|
| 628 |
+
# ── Source registry ───────────────────────────────────────────────────
# Maps each source name to (raw input filename under DATA_RAW, converter).
# process_source() looks sources up here; run_processing() iterates the keys.
SOURCE_CONVERTERS = {
    "codealpaca": ("codealpaca.jsonl", convert_codealpaca),
    "codefeedback": ("codefeedback.jsonl", convert_codefeedback),
    "starcoder_python": ("starcoder_python.jsonl", convert_starcoderdata),
    "starcoder_javascript": ("starcoder_javascript.jsonl", convert_starcoderdata),
    "starcoder_typescript": ("starcoder_typescript.jsonl", convert_starcoderdata),
    "starcoder_css": ("starcoder_css.jsonl", convert_starcoderdata),
    "starcoder_html": ("starcoder_html.jsonl", convert_starcoderdata),
    "evol_code": ("evol_code.jsonl", convert_evol_code),
    "magicoder": ("magicoder.jsonl", convert_magicoder),
    "websight": ("websight.jsonl", convert_websight),
    "synthetic_nextjs": ("synthetic_nextjs.jsonl", convert_synthetic),
    "search_examples": ("search_examples.jsonl", convert_synthetic),
    "sandbox_examples": ("sandbox_examples.jsonl", convert_synthetic),
}

# Combined output: all converted examples, one JSON object per line
# (opened in append mode by process_source so sources can be run incrementally).
OUTPUT_FILE = DATA_PROCESSED / "mindi_all.jsonl"
| 647 |
+
|
| 648 |
+
# ── Main processing pipeline ──────────────────────────────────────────
def process_source(
    source_name: str,
    global_idx: int,
    progress: Progress,
    dry_run: bool = False,
) -> tuple[int, int, int]:
    """Process one raw source file, appending converted examples to OUTPUT_FILE.

    Args:
        source_name: Key into SOURCE_CONVERTERS.
        global_idx: Next global example index (used for sequential IDs).
        progress: Rich Progress instance to report per-line advancement.
        dry_run: When True, convert and count but write nothing.

    Returns:
        Tuple of (converted, skipped, updated_global_idx).

    Fixes vs. previous version: the line-count pass now closes its file
    handle (it previously leaked via a bare ``open()``); the unused
    ``line_num`` from ``enumerate`` is gone; the periodic flush no longer
    fires while ``converted`` is still 0.
    """
    if source_name not in SOURCE_CONVERTERS:
        log.error(f"Unknown source: {source_name}")
        return 0, 0, global_idx

    filename, converter = SOURCE_CONVERTERS[source_name]
    input_path = DATA_RAW / filename

    if not input_path.exists():
        log.warning(f"⏭️ Skipping {source_name}: {input_path} not found (download first)")
        return 0, 0, global_idx

    # Count lines up front so the progress bar has a total; use a context
    # manager so the counting handle is closed deterministically.
    with open(input_path, encoding="utf-8") as count_handle:
        total_lines = sum(1 for _ in count_handle)
    task = progress.add_task(f"[cyan]{source_name}", total=total_lines)

    converted = 0
    skipped = 0
    output_handle = None

    if not dry_run:
        # Append mode so we can process sources incrementally.
        output_handle = open(OUTPUT_FILE, "a", encoding="utf-8")

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    progress.update(task, advance=1)
                    continue

                try:
                    raw = json.loads(line)
                except json.JSONDecodeError:
                    # Malformed JSON lines are skipped, never fatal.
                    skipped += 1
                    progress.update(task, advance=1)
                    continue

                result = converter(raw, global_idx)

                if result is None:
                    skipped += 1
                else:
                    if output_handle is not None:
                        output_handle.write(json.dumps(result, ensure_ascii=False) + "\n")
                    converted += 1
                    global_idx += 1

                progress.update(task, advance=1)

                # Flush periodically so progress survives interruption.
                if output_handle is not None and converted and converted % 5000 == 0:
                    output_handle.flush()

    finally:
        if output_handle:
            output_handle.close()

    log.info(f"{'[DRY RUN] ' if dry_run else ''}✅ {source_name}: {converted:,} converted, {skipped:,} skipped")
    return converted, skipped, global_idx
+
|
| 718 |
+
def run_processing(
    source: Optional[str] = None,
    dry_run: bool = False,
) -> None:
    """Run the full raw-to-MINDI conversion pipeline.

    Args:
        source: Process only this source; all registered sources when None.
        dry_run: Convert and report statistics without writing output.

    Fix vs. previous version: the "Global ID range" summary row is now
    guarded — with ``global_idx == 0`` it previously rendered a negative
    ID ("mindi_-00001").
    """
    console.print(Panel.fit(
        "[bold cyan]MINDI 1.5 Vision-Coder — MINDI Format Converter[/]\n"
        "[dim]Day 2 Step 2: Convert raw datasets to MINDI training format[/]",
        border_style="cyan",
    ))

    # Determine which sources to process.
    sources = [source] if source else list(SOURCE_CONVERTERS.keys())

    # Show which raw files are present before doing any work.
    available_table = Table(title="📁 Raw Data Files")
    available_table.add_column("Source", style="cyan")
    available_table.add_column("File")
    available_table.add_column("Exists")
    available_table.add_column("Size")

    for src in sources:
        fname, _ = SOURCE_CONVERTERS[src]
        fpath = DATA_RAW / fname
        exists = fpath.exists()
        size = f"{fpath.stat().st_size / (1024*1024):.1f} MB" if exists else "—"
        available_table.add_row(src, fname, "✅" if exists else "❌", size)

    console.print(available_table)

    # Count existing examples so new IDs continue the sequence (append mode).
    existing_count = 0
    if OUTPUT_FILE.exists():
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            existing_count = sum(1 for _ in f)
        log.info(f"📊 Existing mindi_all.jsonl has {existing_count:,} examples — appending new data")

    # Process each source, threading the global index through.
    total_converted = 0
    total_skipped = 0
    global_idx = existing_count

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
        refresh_per_second=2,
    ) as progress:
        for src in sources:
            converted, skipped, global_idx = process_source(
                src, global_idx, progress, dry_run=dry_run
            )
            total_converted += converted
            total_skipped += skipped

    # Summary
    console.print()
    summary = Table(title="📊 Processing Summary")
    summary.add_column("Metric", style="cyan")
    summary.add_column("Value", justify="right", style="green")

    summary.add_row("Previously existing", f"{existing_count:,}")
    summary.add_row("Newly converted", f"{total_converted:,}")
    summary.add_row("Total skipped", f"{total_skipped:,}")
    grand_total = existing_count + total_converted
    summary.add_row("[bold]Grand total[/]", f"[bold]{grand_total:,}[/]")
    # Guard: only show an ID range if at least one example exists.
    if global_idx > 0:
        summary.add_row("Global ID range", f"mindi_000000 → mindi_{global_idx - 1:06d}")

    if not dry_run and OUTPUT_FILE.exists():
        size_mb = OUTPUT_FILE.stat().st_size / (1024 * 1024)
        summary.add_row("Output file", str(OUTPUT_FILE.relative_to(PROJECT_ROOT)))
        summary.add_row("Output size", f"{size_mb:.1f} MB")

    console.print(summary)

    if grand_total >= 500_000:
        console.print("\n[bold green]🎉 TARGET REACHED: 500K+ examples in MINDI format![/]")
    elif grand_total > 0:
        remaining = 500_000 - grand_total
        console.print(f"\n[yellow]⏳ {grand_total:,} total examples ({remaining:,} more needed for 500K target)[/]")
    else:
        console.print("\n[yellow]⚠️ No examples converted — download raw data first (scripts/download_datasets.py)[/]")
| 808 |
+
|
| 809 |
+
# ── CLI ───────────────────────────────────────────────────────────────
def main() -> None:
    """Entry point for the command-line interface."""
    cli = argparse.ArgumentParser(description="MINDI Format Converter")
    cli.add_argument("--source", type=str, help="Process a specific source only")
    cli.add_argument("--dry-run", action="store_true", help="Preview without writing output")
    options = cli.parse_args()

    run_processing(source=options.source, dry_run=options.dry_run)
|
| 818 |
+
|
| 819 |
+
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
|