Faaz committed on
Commit
59c6c97
Β·
1 Parent(s): 11e0d89

Day 2 COMPLETE: 1.48M examples processed, 6GB dataset, WebSight done

Browse files
scripts/download_datasets.py ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder β€” Day 2 Step 1: Dataset Download Pipeline
3
+
4
+ Downloads 7 datasets (500K+ examples total) with:
5
+ - Rich progress bars
6
+ - Network retry with exponential backoff
7
+ - Checkpoint/resume support
8
+ - Disk space estimation
9
+ - Logging to logs/download.log
10
+ - Running total of examples
11
+
12
+ Usage:
13
+ python scripts/download_datasets.py # Download all
14
+ python scripts/download_datasets.py --dataset websight # Download one
15
+ python scripts/download_datasets.py --stage 1 # Stage 1 only (small/fast)
16
+ python scripts/download_datasets.py --stage 2 # Stage 2 (starcoder)
17
+ python scripts/download_datasets.py --stage 3 # Stage 3 (websight)
18
+ python scripts/download_datasets.py --synthetic # Synthetic only
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import hashlib
25
+ import json
26
+ import logging
27
+ import os
28
+ import random
29
+ import sys
30
+ import time
31
+ import traceback
32
+ from dataclasses import dataclass, field
33
+ from pathlib import Path
34
+ from typing import Any, Generator, Optional
35
+
36
+ from rich.console import Console
37
+ from rich.logging import RichHandler
38
+ from rich.panel import Panel
39
+ from rich.progress import (
40
+ BarColumn,
41
+ MofNCompleteColumn,
42
+ Progress,
43
+ SpinnerColumn,
44
+ TextColumn,
45
+ TimeElapsedColumn,
46
+ TimeRemainingColumn,
47
+ )
48
+ from rich.table import Table
49
+
50
# ── Project paths ─────────────────────────────────────────────────────
# Everything is resolved relative to the repository root (the parent of
# scripts/), so the script works from any current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"  # downloaded/generated JSONL files land here
LOGS_DIR = PROJECT_ROOT / "logs"  # plain-text run log lives here
CHECKPOINT_FILE = DATA_RAW / ".download_checkpoint.json"  # resume state

DATA_RAW.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# ── Logging ───────────────────────────────────────────────────────────
console = Console()

# Log to both the terminal (pretty, via Rich) and logs/download.log
# (plain text) so long overnight runs leave an auditable trail.
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(console=console, rich_tracebacks=True, show_path=False),
        logging.FileHandler(LOGS_DIR / "download.log", encoding="utf-8"),
    ],
)
log = logging.getLogger("mindi.download")
72
+
73
+
74
# ── Checkpoint manager ────────────────────────────────────────────────
class CheckpointManager:
    """Tracks which datasets are complete so downloads can resume.

    State is persisted as JSON at *path* with two top-level keys:
    ``completed`` (name -> {count, size_mb, timestamp}) and
    ``in_progress`` (name -> {count}).
    """

    def __init__(self, path: Path = CHECKPOINT_FILE) -> None:
        self.path = path
        self.data: dict[str, Any] = self._load()

    def _load(self) -> dict[str, Any]:
        """Load persisted state, falling back to a fresh state.

        A truncated or corrupt checkpoint file (e.g. from a crash mid
        ``save``) previously raised and aborted the whole run; now it is
        treated as "start over".  Both top-level keys are guaranteed to
        exist even if an older/partial file omits one.
        """
        fresh: dict[str, Any] = {"completed": {}, "in_progress": {}}
        if not self.path.exists():
            return fresh
        try:
            loaded = json.loads(self.path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            # Worst case we re-download finished datasets; never crash here.
            log.warning(f"Corrupt checkpoint at {self.path}; starting fresh")
            return fresh
        if not isinstance(loaded, dict):
            return fresh
        loaded.setdefault("completed", {})
        loaded.setdefault("in_progress", {})
        return loaded

    def save(self) -> None:
        """Persist the current state to disk as pretty-printed JSON."""
        self.path.write_text(json.dumps(self.data, indent=2), encoding="utf-8")

    def is_complete(self, name: str) -> bool:
        """Return True if dataset *name* has finished downloading."""
        return name in self.data["completed"]

    def mark_complete(self, name: str, count: int, size_mb: float) -> None:
        """Record *name* as done and drop any stale in-progress entry."""
        self.data["completed"][name] = {
            "count": count,
            "size_mb": round(size_mb, 2),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        self.data["in_progress"].pop(name, None)
        self.save()

    def mark_in_progress(self, name: str, count: int) -> None:
        """Checkpoint a partial download at *count* examples written."""
        self.data["in_progress"][name] = {"count": count}
        self.save()

    def get_resume_count(self, name: str) -> int:
        """Examples already written for *name* in a previous partial run."""
        return self.data.get("in_progress", {}).get(name, {}).get("count", 0)

    def get_total_examples(self) -> int:
        """Total example count across all completed datasets."""
        return sum(v["count"] for v in self.data["completed"].values())
111
+
112
+
113
# ── Dataset definitions ───────────────────────────────────────────────
@dataclass
class DatasetConfig:
    """Declarative description of one dataset the pipeline can produce."""

    name: str  # unique key used for CLI selection and checkpointing
    hf_name: str  # HuggingFace repo id ("" for synthetic sets)
    hf_subset: Optional[str]  # HF config name, e.g. "python" for starcoderdata
    hf_split: str  # split to stream, normally "train" ("" for synthetic)
    target_count: int  # stop streaming/generating after this many examples
    output_file: str  # JSONL filename written under data/raw/
    stage: int  # 1-3 = download stages by size; 0 = synthetic
    est_size_gb: float  # rough on-disk estimate, used by the disk-space check
    description: str  # human-readable summary shown in tables/logs
    languages: list[str] = field(default_factory=list)  # language tags (starcoder subsets)
    is_synthetic: bool = False  # generated locally instead of downloaded
127
+
128
+
129
# Registry of every dataset the pipeline knows about.  Stages group the
# downloads by expected wall-clock time; stage 0 entries are generated
# locally (is_synthetic=True) rather than downloaded.
DATASETS: list[DatasetConfig] = [
    # Stage 1 β€” Small/fast (5-10 min)
    DatasetConfig(
        name="codealpaca",
        hf_name="sahil2801/CodeAlpaca-20k",
        hf_subset=None,
        hf_split="train",
        target_count=20_000,
        output_file="codealpaca.jsonl",
        stage=1,
        est_size_gb=0.05,
        description="Code instruction-following pairs",
    ),
    DatasetConfig(
        name="codefeedback",
        hf_name="m-a-p/CodeFeedback-Filtered-Instruction",
        hf_subset=None,
        hf_split="train",
        target_count=50_000,
        output_file="codefeedback.jsonl",
        stage=1,
        est_size_gb=0.3,
        description="Code with human feedback",
    ),
    # Stage 2 β€” Medium (1-2 hours)
    DatasetConfig(
        name="starcoder_python",
        hf_name="bigcode/starcoderdata",
        hf_subset="python",
        hf_split="train",
        target_count=100_000,
        output_file="starcoderdata.jsonl",
        stage=2,
        est_size_gb=2.0,
        description="StarCoder Python code",
        languages=["python"],
    ),
    DatasetConfig(
        name="starcoder_javascript",
        hf_name="bigcode/starcoderdata",
        hf_subset="javascript",
        hf_split="train",
        target_count=100_000,
        output_file="starcoderdata.jsonl",  # appends to same file
        stage=2,
        est_size_gb=2.0,
        description="StarCoder JavaScript code",
        languages=["javascript"],
    ),
    DatasetConfig(
        name="starcoder_typescript",
        hf_name="bigcode/starcoderdata",
        hf_subset="typescript",
        hf_split="train",
        target_count=50_000,
        output_file="starcoderdata.jsonl",  # appends to same file
        stage=2,
        est_size_gb=1.0,
        description="StarCoder TypeScript code",
        languages=["typescript"],
    ),
    # Stage 3 β€” Large (overnight)
    DatasetConfig(
        name="websight",
        hf_name="HuggingFaceM4/WebSight",
        hf_subset="v0.2",
        hf_split="train",
        target_count=200_000,
        output_file="websight.jsonl",
        stage=3,
        est_size_gb=8.0,
        description="Screenshots + HTML code pairs",
    ),
    # Synthetic β€” No download needed
    DatasetConfig(
        name="synthetic_nextjs",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=30_000,
        output_file="synthetic_nextjs.jsonl",
        stage=0,
        est_size_gb=0.2,
        description="Synthetic Next.js components with MINDI format",
        is_synthetic=True,
    ),
    DatasetConfig(
        name="search_examples",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=5_000,
        output_file="search_examples.jsonl",
        stage=0,
        est_size_gb=0.03,
        description="MINDI search usage examples",
        is_synthetic=True,
    ),
    DatasetConfig(
        name="sandbox_examples",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=3_000,
        output_file="sandbox_examples.jsonl",
        stage=0,
        est_size_gb=0.02,
        description="MINDI sandbox error-fix examples",
        is_synthetic=True,
    ),
]
240
+
241
+
242
# ── Retry helper ──────────────────────────────────────────────────────
def retry_with_backoff(fn, max_retries: int = 5, base_delay: float = 2.0):
    """Call ``fn()`` and return its result, retrying on any exception.

    Waits ``base_delay * 2**attempt`` seconds between attempts plus a
    random jitter proportional to that delay, so concurrent workers do
    not retry in lockstep.  The final failure is re-raised unchanged.

    Args:
        fn: Zero-argument callable to invoke.
        max_retries: Total number of attempts; must be >= 1.
        base_delay: Initial backoff delay in seconds.

    Returns:
        Whatever ``fn()`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: The exception from the final failed attempt.
    """
    if max_retries < 1:
        # Previously this fell through and silently returned None,
        # masking a misconfiguration at the call site.
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt)
            # Jitter scaled to the delay: the old fixed uniform(0, 1)
            # second of jitter dwarfed small base delays.
            delay += random.uniform(0, delay)
            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s...")
            time.sleep(delay)
254
+
255
+
256
# ── HuggingFace download ─────────────────────────────────────────────
def download_hf_dataset(
    config: DatasetConfig,
    checkpoint: CheckpointManager,
    progress: Progress,
) -> int:
    """Download a HuggingFace dataset with streaming and save as JSONL.

    Streams ``config.hf_name`` (nothing fully materialised in memory)
    and writes each raw example as one JSON line to
    ``data/raw/<config.output_file>``, checkpointing every 5000 examples
    so an interrupted run can resume.

    Returns:
        Number of examples counted, including examples fast-forwarded
        past during resume (not including JSON-unserialisable skips).
    """
    # Imported lazily: `datasets` is heavy and unneeded for synthetic-only runs.
    from datasets import load_dataset

    output_path = DATA_RAW / config.output_file
    resume_count = checkpoint.get_resume_count(config.name)

    # For starcoder subsets that share an output file, use append mode
    # but only if this specific subset hasn't been completed
    is_append = config.output_file == "starcoderdata.jsonl" and output_path.exists()
    mode = "a" if is_append else "w"
    # NOTE(review): the first branch below restates the assignment above
    # and is redundant; kept byte-identical on purpose.
    if not is_append and resume_count == 0:
        mode = "w"
    elif resume_count > 0:
        mode = "a"
        log.info(f"Resuming {config.name} from example {resume_count:,}")

    # Progress bar starts at the resume offset so ETA stays meaningful.
    task = progress.add_task(
        f"[cyan]{config.name}",
        total=config.target_count,
        completed=resume_count,
    )

    log.info(f"Loading {config.hf_name} (subset={config.hf_subset}, split={config.hf_split}) streaming=True")

    def _load():
        # Built as kwargs so `name` is only passed when a subset is set.
        kwargs = {
            "path": config.hf_name,
            "split": config.hf_split,
            "streaming": True,
            "trust_remote_code": True,
        }
        if config.hf_subset:
            kwargs["name"] = config.hf_subset
        return load_dataset(**kwargs)

    ds = retry_with_backoff(_load)

    count = 0
    skipped = 0
    with open(output_path, mode, encoding="utf-8") as f:
        for example in ds:
            # Resume: re-stream from the beginning and fast-forward past
            # examples already written by a previous partial run.
            if count < resume_count:
                count += 1
                continue

            # Write raw example as JSONL
            try:
                line = json.dumps(example, ensure_ascii=False, default=str)
                f.write(line + "\n")
            except (TypeError, ValueError) as e:
                # Unserialisable example β€” drop it and keep streaming.
                skipped += 1
                continue

            count += 1
            progress.update(task, completed=count)

            # Periodic checkpoint every 5000 examples
            if count % 5000 == 0:
                checkpoint.mark_in_progress(config.name, count)
                f.flush()

            if count >= config.target_count:
                break

    size_mb = output_path.stat().st_size / (1024 * 1024)
    log.info(f"βœ… {config.name}: {count:,} examples, {size_mb:.1f} MB (skipped {skipped})")
    progress.update(task, completed=count)
    return count
330
+
331
+
332
# ── Synthetic generators ──────────────────────────────────────────────

# Component templates for synthetic Next.js data
COMPONENT_TYPES = [
    "Navbar", "Hero", "Footer", "Sidebar", "Card", "Modal", "Dropdown",
    "Accordion", "Tabs", "Carousel", "Pagination", "Breadcrumb", "Alert",
    "Toast", "Badge", "Avatar", "Tooltip", "Popover", "Progress", "Spinner",
    "Skeleton", "Table", "Form", "Input", "Select", "Checkbox", "Radio",
    "Switch", "Slider", "DatePicker", "FileUpload", "SearchBar", "CommandPalette",
    "DataTable", "Chart", "Calendar", "Timeline", "Stepper", "Rating",
    "PricingCard", "TestimonialCard", "FeatureGrid", "StatsSection",
    "CTASection", "Newsletter", "LoginForm", "SignupForm", "ProfileCard",
    "DashboardLayout", "SettingsPanel", "NotificationList", "ChatBubble",
]

# Standard Tailwind palette names used to vary generated colour schemes.
TAILWIND_COLORS = [
    "slate", "gray", "zinc", "neutral", "stone", "red", "orange", "amber",
    "yellow", "lime", "green", "emerald", "teal", "cyan", "sky", "blue",
    "indigo", "violet", "purple", "fuchsia", "pink", "rose",
]

# Design/UX techniques mixed into prompts and critiques for variety.
DESIGN_PATTERNS = [
    "responsive grid layout", "flexbox centering", "gradient background",
    "glassmorphism effect", "dark mode support", "animated entrance",
    "hover transitions", "skeleton loading state", "error boundary",
    "lazy loading", "infinite scroll", "drag and drop", "keyboard navigation",
    "focus management", "scroll animations", "parallax effect",
]

# User-prompt templates; {component}/{color}/{pattern} are substituted.
USER_REQUESTS = [
    "Build me a {component} component with {pattern}",
    "Create a modern {component} using Tailwind CSS with {color} theme",
    "I need a {component} that supports dark mode and is fully accessible",
    "Design a {component} with smooth animations and {pattern}",
    "Make a responsive {component} component for a SaaS dashboard",
    "Build a {component} with TypeScript and proper prop types",
    "Create a reusable {component} with {pattern} for a landing page",
    "I want a {component} that looks like the latest {color} design trend",
    "Generate a production-ready {component} with {pattern}",
    "Build a {component} component with Framer Motion animations",
]

# Assistant self-critique templates; {pattern}/{color} are substituted.
CRITIQUE_TEMPLATES = [
    "Visual Analysis:\n- βœ… Layout: Clean {pattern} implementation\n- βœ… Typography: Proper hierarchy with {color} accent colors\n- ⚠️ Accessibility: Consider adding aria-labels to interactive elements\n- βœ… Responsiveness: Works across breakpoints",
    "Design Review:\n- βœ… Color scheme: {color} palette creates good visual harmony\n- βœ… Spacing: Consistent padding and margins\n- ⚠️ Touch targets: Buttons should be at least 44px for mobile\n- βœ… Visual hierarchy: Clear flow from header to content",
    "UI/UX Assessment:\n- βœ… {pattern}: Well implemented with smooth transitions\n- βœ… Contrast: Text is readable against background\n- ⚠️ Loading state: Consider adding skeleton screens\n- βœ… Component structure: Clean separation of concerns",
]

# Ready-made improvement suggestions (used verbatim, no substitution).
SUGGEST_TEMPLATES = [
    "Improvements for next iteration:\n1. Add aria-label attributes for screen readers\n2. Implement keyboard navigation (Tab, Enter, Escape)\n3. Add loading skeleton state\n4. Consider adding subtle micro-interactions on hover",
    "Suggestions:\n1. Add error boundary wrapper for production safety\n2. Implement responsive breakpoints for sm/md/lg/xl\n3. Add unit tests with @testing-library/react\n4. Consider extracting reusable hooks for state logic",
    "Next steps:\n1. Add dark mode toggle using next-themes\n2. Optimize images with next/image component\n3. Add Storybook stories for documentation\n4. Implement proper TypeScript discriminated unions for variants",
]
385
+
386
+
387
def _generate_code_block(component: str, color: str) -> str:
    """Generate a realistic Next.js component code block.

    Returns TSX source for a clickable, keyboard-accessible component
    named *component*, themed with Tailwind *color* classes and a
    randomly chosen variant.
    """
    props_name = f"{component}Props"
    variants = ["default", "primary", "secondary", "outline", "ghost"]
    variant = random.choice(variants)

    # Doubled braces ({{ }}) in the template are literal braces in the
    # emitted TSX; single braces interpolate component/color/variant.
    code = f"""'use client';

import {{ useState }} from 'react';
import {{ cn }} from '@/lib/utils';

interface {props_name} {{
  variant?: '{variant}' | 'default';
  className?: string;
  children?: React.ReactNode;
}}

export default function {component}({{ variant = 'default', className, children }}: {props_name}) {{
  const [isActive, setIsActive] = useState(false);

  return (
    <div
      className={{cn(
        'rounded-lg border p-4 transition-all duration-200',
        variant === '{variant}' && 'bg-{color}-50 border-{color}-200 text-{color}-900',
        variant === 'default' && 'bg-white border-gray-200 text-gray-900',
        isActive && 'ring-2 ring-{color}-500 shadow-lg',
        className
      )}}
      onClick={{() => setIsActive(!isActive)}}
      role="button"
      tabIndex={{0}}
      onKeyDown={{(e) => e.key === 'Enter' && setIsActive(!isActive)}}
    >
      <div className="flex items-center justify-between">
        <h3 className="text-lg font-semibold">{component}</h3>
        <span className="text-sm text-{color}-600">{{variant}}</span>
      </div>
      <div className="mt-2 text-sm text-gray-600">
        {{children}}
      </div>
    </div>
  );
}}"""
    return code
432
+
433
+
434
def generate_synthetic_nextjs(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Generate synthetic Next.js training examples in MINDI format.

    Each yielded dict carries ``user`` (the prompt) and ``assistant``
    (a full MINDI response with think/file/code/critique/suggest
    sections) plus metadata fields for downstream filtering.

    Args:
        count: Number of examples to yield.
        progress: Shared Rich progress renderer.
    """
    task = progress.add_task("[magenta]synthetic_nextjs", total=count)

    for i in range(count):
        component = random.choice(COMPONENT_TYPES)
        color = random.choice(TAILWIND_COLORS)
        pattern = random.choice(DESIGN_PATTERNS)

        request_template = random.choice(USER_REQUESTS)
        user_request = request_template.format(
            component=component, color=color, pattern=pattern
        )

        code = _generate_code_block(component, color)
        filename = f"src/components/{component}.tsx"

        thinking = (
            f"The user wants a {component} component. I need to:\n"
            f"1. Create a TypeScript component with proper prop types\n"
            f"2. Use Tailwind CSS with {color} color scheme\n"
            f"3. Implement {pattern}\n"
            f"4. Ensure accessibility with ARIA attributes\n"
            f"5. Add keyboard navigation support"
        )

        critique = random.choice(CRITIQUE_TEMPLATES).format(
            pattern=pattern, color=color
        )
        suggestions = random.choice(SUGGEST_TEMPLATES)

        assistant_content = (
            f"<|think_start|>\n{thinking}\n<|think_end|>\n\n"
            # Fix: emit the generated component path instead of the
            # "(unknown)" placeholder; `filename` was computed but unused.
            f"<|file_start|>\npath: {filename}\nlanguage: typescript\nframework: next.js 14\n<|file_end|>\n\n"
            f"<|code_start|>\n{code}\n<|code_end|>\n\n"
            f"<|critique_start|>\n{critique}\n<|critique_end|>\n\n"
            f"<|suggest_start|>\n{suggestions}\n<|suggest_end|>"
        )

        yield {
            "id": f"synthetic_{i:06d}",
            "source": "synthetic_nextjs",
            "user": user_request,
            "assistant": assistant_content,
            "component": component,
            "language": "typescript",
            "framework": "nextjs",
        }

        progress.update(task, completed=i + 1)
484
+
485
+
486
def generate_search_examples(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Yield synthetic examples demonstrating MINDI's search-tool usage.

    Cycles deterministically through a fixed set of question/query pairs
    and package recommendations, producing one dict per example.
    """
    bar = progress.add_task("[yellow]search_examples", total=count)

    # (user question, search query) pairs, cycled in order.
    search_scenarios = [
        ("How to implement dark mode in Next.js 14?", "next.js 14 dark mode implementation next-themes"),
        ("Best practices for React form validation", "react form validation zod react-hook-form 2025"),
        ("How to set up authentication in Next.js?", "next.js 14 authentication NextAuth.js credentials"),
        ("Tailwind CSS animation examples", "tailwind css animation keyframes framer-motion"),
        ("How to optimize images in Next.js?", "next.js image optimization next/image blur placeholder"),
        ("React server components best practices", "react server components RSC data fetching patterns"),
        ("How to deploy Next.js to Vercel?", "next.js 14 vercel deployment environment variables"),
        ("TypeScript utility types for React", "typescript react utility types ComponentProps PropsWithChildren"),
        ("How to use Zustand for state management?", "zustand state management react next.js middleware"),
        ("CSS Grid vs Flexbox for layouts", "css grid flexbox responsive layout patterns 2025"),
        ("How to implement infinite scroll?", "react infinite scroll intersection observer tanstack query"),
        ("Next.js API routes best practices", "next.js 14 route handlers API validation zod"),
        ("How to add SEO to Next.js?", "next.js 14 metadata SEO generateMetadata open graph"),
        ("React testing best practices", "react testing library jest vitest component testing"),
        ("How to use Prisma with Next.js?", "prisma next.js 14 database postgresql schema"),
    ]

    # (package name, blurb, install command) triples, cycled in order.
    packages_db = [
        ("framer-motion", "Production-ready motion library for React", "npm i framer-motion"),
        ("next-themes", "Dark mode for Next.js apps", "npm i next-themes"),
        ("zustand", "Small, fast state management", "npm i zustand"),
        ("@tanstack/react-query", "Powerful data synchronization", "npm i @tanstack/react-query"),
        ("react-hook-form", "Performant forms with validation", "npm i react-hook-form"),
        ("zod", "TypeScript-first schema validation", "npm i zod"),
        ("tailwind-merge", "Merge Tailwind classes without conflicts", "npm i tailwind-merge"),
        ("clsx", "Tiny utility for constructing className strings", "npm i clsx"),
        ("lucide-react", "Beautiful SVG icons for React", "npm i lucide-react"),
        ("@radix-ui/react-dialog", "Accessible dialog component", "npm i @radix-ui/react-dialog"),
    ]

    for idx in range(count):
        user_q, search_query = search_scenarios[idx % len(search_scenarios)]
        pkg_name, pkg_desc, pkg_install = packages_db[idx % len(packages_db)]

        assistant_content = (
            f"<|think_start|>\nThe user is asking about {user_q.lower().rstrip('?')}. "
            f"Let me search for the latest best practices.\n<|think_end|>\n\n"
            f"<|search_start|>\nquery: \"{search_query}\"\n"
            f"results: [\n"
            f"  {{\"title\": \"Official Documentation\", \"url\": \"https://docs.example.com\", \"snippet\": \"Comprehensive guide...\"}},\n"
            f"  {{\"title\": \"Best Practices 2025\", \"url\": \"https://blog.example.com\", \"snippet\": \"Updated approach...\"}}\n"
            f"]\n<|search_end|>\n\n"
            f"Based on my research, here's the recommended approach:\n\n"
            f"First, install the required package:\n```bash\n{pkg_install}\n```\n\n"
            f"**{pkg_name}** β€” {pkg_desc}\n\n"
            f"<|code_start|>\n"
            f"// Example usage of {pkg_name}\n"
            f"import {{ /* relevant imports */ }} from '{pkg_name}';\n\n"
            f"export default function Example() {{\n"
            f"  // Implementation based on search results\n"
            f"  return <div>Example using {pkg_name}</div>;\n"
            f"}}\n"
            f"<|code_end|>"
        )

        yield {
            "id": f"search_{idx:06d}",
            "source": "search_examples",
            "user": user_q,
            "assistant": assistant_content,
            "search_query": search_query,
        }

        progress.update(bar, completed=idx + 1)
557
+
558
+
559
def generate_sandbox_examples(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Generate synthetic sandbox error-fix examples.

    Cycles through a fixed catalogue of common Next.js/React build and
    runtime errors, yielding one MINDI-formatted dict per example:
    think -> error -> failed sandbox run -> fix -> file -> code ->
    successful sandbox run.
    """
    task = progress.add_task("[red]sandbox_examples", total=count)

    # Each scenario: observed error, diagnosed root cause, applied fix,
    # and the file the fix lands in.
    error_scenarios = [
        {
            "error": "TypeError: Cannot read properties of undefined (reading 'map')",
            "cause": "Data array is undefined on initial render before API response",
            "fix": "Add optional chaining and fallback: data?.items?.map(...) ?? []",
            "file": "src/components/DataList.tsx",
        },
        {
            "error": "Error: Hydration failed because the initial UI does not match what was rendered on the server",
            "cause": "Using browser-only APIs (window, localStorage) during server render",
            "fix": "Wrap in useEffect or use dynamic import with ssr: false",
            "file": "src/components/ThemeProvider.tsx",
        },
        {
            "error": "Module not found: Can't resolve '@/components/ui/button'",
            "cause": "Path alias not configured in tsconfig.json",
            "fix": "Add paths mapping in tsconfig.json: '@/*': ['./src/*']",
            "file": "tsconfig.json",
        },
        {
            "error": "Warning: Each child in a list should have a unique 'key' prop",
            "cause": "Missing key prop in .map() iteration",
            "fix": "Add key={item.id} to the mapped JSX element",
            "file": "src/components/ItemList.tsx",
        },
        {
            "error": "TypeError: fetch failed - ECONNREFUSED",
            "cause": "API endpoint is unreachable or CORS is not configured",
            "fix": "Use Next.js API route as proxy, add CORS headers",
            "file": "src/app/api/proxy/route.ts",
        },
        {
            "error": "Error: Invalid hook call. Hooks can only be called inside of the body of a function component",
            "cause": "Calling useState inside a conditional or nested function",
            "fix": "Move hook call to the top level of the component function",
            "file": "src/hooks/useAuth.ts",
        },
        {
            "error": "Build error: Type 'string | undefined' is not assignable to type 'string'",
            "cause": "Environment variable might be undefined at build time",
            "fix": "Add non-null assertion or provide default value with ?? ''",
            "file": "src/lib/config.ts",
        },
        {
            "error": "Warning: validateDOMNesting(...): <div> cannot appear as a descendant of <p>",
            "cause": "Invalid HTML nesting - block element inside inline element",
            "fix": "Change outer <p> to <div> or inner <div> to <span>",
            "file": "src/components/Card.tsx",
        },
        {
            "error": "Error: NEXT_REDIRECT in API route",
            "cause": "Using redirect() in a try/catch block catches the redirect error",
            "fix": "Move redirect() call outside of try/catch, or re-throw NEXT_REDIRECT",
            "file": "src/app/api/auth/route.ts",
        },
        {
            "error": "Unhandled Runtime Error: Maximum update depth exceeded",
            "cause": "useEffect dependency causes infinite re-render loop",
            "fix": "Memoize the dependency with useMemo or useCallback",
            "file": "src/hooks/useData.ts",
        },
    ]

    for i in range(count):
        # Cycle through scenarios so any count produces valid examples.
        scenario = error_scenarios[i % len(error_scenarios)]

        assistant_content = (
            f"<|think_start|>\n"
            f"I see a build error. Let me analyze:\n"
            f"Error: {scenario['error']}\n"
            f"Root cause: {scenario['cause']}\n"
            f"I need to fix this in {scenario['file']}\n"
            f"<|think_end|>\n\n"
            f"<|error_start|>\n"
            f"File: {scenario['file']}\n"
            f"Error: {scenario['error']}\n"
            f"<|error_end|>\n\n"
            f"<|sandbox_start|>\n"
            f"Running: npm run build\n"
            f"Status: FAILED\n"
            f"Exit code: 1\n"
            f"<|sandbox_end|>\n\n"
            f"<|fix_start|>\n"
            f"Root cause: {scenario['cause']}\n"
            f"Solution: {scenario['fix']}\n"
            f"<|fix_end|>\n\n"
            f"<|file_start|>\npath: {scenario['file']}\nlanguage: typescript\n<|file_end|>\n\n"
            f"<|code_start|>\n"
            f"// Fixed version of {scenario['file']}\n"
            f"// Applied fix: {scenario['fix']}\n"
            f"export default function Fixed() {{\n"
            f"  // Corrected implementation\n"
            f"  return <div>Fixed component</div>;\n"
            f"}}\n"
            f"<|code_end|>\n\n"
            f"<|sandbox_start|>\n"
            f"Running: npm run build\n"
            f"Status: SUCCESS\n"
            f"Exit code: 0\n"
            f"<|sandbox_end|>"
        )

        yield {
            "id": f"sandbox_{i:06d}",
            "source": "sandbox_examples",
            "user": f"I'm getting this error: {scenario['error']}",
            "assistant": assistant_content,
            # Truncated error text, used as a cheap grouping key downstream.
            "error_type": scenario["error"][:50],
        }

        progress.update(task, completed=i + 1)
674
+
675
+
676
def write_synthetic(
    config: DatasetConfig,
    checkpoint: CheckpointManager,
    progress: Progress,
) -> int:
    """Generate and write synthetic data to ``data/raw/<output_file>``.

    Args:
        config: Which synthetic dataset to produce; ``config.name`` must
            match a registered generator.
        checkpoint: Accepted for signature parity with
            ``download_hf_dataset``; synthetic generation is fast enough
            that it always restarts from scratch rather than resuming.
        progress: Shared Rich progress renderer.

    Returns:
        Number of examples written.

    Raises:
        ValueError: If ``config.name`` has no registered generator.
    """
    output_path = DATA_RAW / config.output_file

    generators = {
        "synthetic_nextjs": generate_synthetic_nextjs,
        "search_examples": generate_search_examples,
        "sandbox_examples": generate_sandbox_examples,
    }

    # Fail with a clear message instead of a bare KeyError when a new
    # synthetic DatasetConfig is added without a matching generator.
    gen_fn = generators.get(config.name)
    if gen_fn is None:
        raise ValueError(
            f"No synthetic generator registered for {config.name!r}; "
            f"known: {sorted(generators)}"
        )

    count = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for example in gen_fn(config.target_count, progress):
            f.write(json.dumps(example, ensure_ascii=False) + "\n")
            count += 1

    size_mb = output_path.stat().st_size / (1024 * 1024)
    log.info(f"βœ… {config.name}: {count:,} examples, {size_mb:.1f} MB")
    return count
701
+
702
+
703
# ── Disk space check ──────────────────────────────────────────────────
def check_disk_space(datasets: list[DatasetConfig]) -> bool:
    """Render a disk-space estimate table and refuse clearly unsafe runs.

    Returns True when the combined estimated size of *datasets* fits
    comfortably (under 80% of free space) on the data volume.
    """
    import shutil

    needed_gb = sum(cfg.est_size_gb for cfg in datasets)
    free_gb = shutil.disk_usage(str(DATA_RAW)).free / (1024 ** 3)

    table = Table(title="πŸ’Ύ Disk Space Estimate")
    table.add_column("Item", style="cyan")
    table.add_column("Size", justify="right", style="green")

    for cfg in datasets:
        table.add_row(cfg.name, f"{cfg.est_size_gb:.2f} GB")

    leftover = free_gb - needed_gb
    table.add_row("─" * 20, "─" * 10, style="dim")
    table.add_row("Total estimated", f"{needed_gb:.2f} GB", style="bold")
    table.add_row("Available", f"{free_gb:.1f} GB", style="bold green")
    table.add_row(
        "After download",
        f"~{leftover:.1f} GB",
        style="bold yellow" if leftover > 50 else "bold red",
    )

    console.print(table)

    # Keep a 20% safety margin over the (rough) estimates.
    if needed_gb > free_gb * 0.8:
        log.error(f"Not enough disk space! Need {needed_gb:.1f} GB, have {free_gb:.1f} GB")
        return False

    return True
735
+
736
+
737
# ── Main pipeline ─────────────────────────────────────────────────────
def run_pipeline(
    stage: Optional[int] = None,
    dataset_name: Optional[str] = None,
    synthetic_only: bool = False,
) -> None:
    """Run the download pipeline.

    Selection precedence: an explicit ``dataset_name`` wins over
    ``synthetic_only``, which wins over ``stage``; with no filters every
    entry in DATASETS is processed.  Datasets already recorded as
    complete in the checkpoint file are skipped, and a failure in one
    dataset does not stop the remaining ones.
    """
    console.print(Panel.fit(
        "[bold cyan]MINDI 1.5 Vision-Coder β€” Dataset Download Pipeline[/]\n"
        "[dim]Day 2 Step 1: Download 500K+ training examples[/]",
        border_style="cyan",
    ))

    checkpoint = CheckpointManager()

    # Filter datasets based on args
    if dataset_name:
        targets = [d for d in DATASETS if d.name == dataset_name]
        if not targets:
            log.error(f"Unknown dataset: {dataset_name}. Available: {[d.name for d in DATASETS]}")
            return
    elif synthetic_only:
        targets = [d for d in DATASETS if d.is_synthetic]
    elif stage is not None:
        # Stage 0 doubles as "all synthetic" since synthetic configs use stage=0.
        targets = [d for d in DATASETS if d.stage == stage or (stage == 0 and d.is_synthetic)]
    else:
        targets = DATASETS

    # Show plan
    plan_table = Table(title="πŸ“‹ Download Plan")
    plan_table.add_column("Dataset", style="cyan")
    plan_table.add_column("Examples", justify="right")
    plan_table.add_column("Est. Size", justify="right")
    plan_table.add_column("Stage")
    plan_table.add_column("Status")

    for d in targets:
        status = "βœ… Done" if checkpoint.is_complete(d.name) else "⏳ Pending"
        stage_label = f"Stage {d.stage}" if d.stage > 0 else "Synthetic"
        plan_table.add_row(
            d.name,
            f"{d.target_count:,}",
            f"{d.est_size_gb:.2f} GB",
            stage_label,
            status,
        )

    console.print(plan_table)

    # Check disk space
    pending = [d for d in targets if not checkpoint.is_complete(d.name)]
    if not pending:
        console.print("\n[bold green]βœ… All requested datasets already downloaded![/]")
        _print_summary(checkpoint)
        return

    if not check_disk_space(pending):
        return

    # Download with progress
    console.print()
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
        refresh_per_second=2,
    ) as progress:
        for config in pending:
            # Defensive re-check; `pending` was already filtered above.
            if checkpoint.is_complete(config.name):
                log.info(f"Skipping {config.name} (already complete)")
                continue

            log.info(f"\n{'─' * 50}")
            log.info(f"Starting: {config.name} β€” {config.description}")

            try:
                if config.is_synthetic:
                    count = write_synthetic(config, checkpoint, progress)
                else:
                    count = download_hf_dataset(config, checkpoint, progress)

                size_mb = (DATA_RAW / config.output_file).stat().st_size / (1024 * 1024)
                checkpoint.mark_complete(config.name, count, size_mb)

            except KeyboardInterrupt:
                # In-progress checkpoints were saved along the way; a
                # rerun resumes from the last checkpoint.
                log.warning(f"\n⚠️ Interrupted during {config.name}. Progress saved β€” rerun to resume.")
                return
            except Exception as e:
                # One bad dataset shouldn't abort the overnight batch.
                log.error(f"❌ Failed {config.name}: {e}")
                log.error(traceback.format_exc())
                continue

    _print_summary(checkpoint)
834
+
835
+
836
def _print_summary(checkpoint: CheckpointManager) -> None:
    """Render a final table of completed datasets plus grand totals."""
    console.print()
    summary = Table(title="πŸ“Š Download Summary")
    summary.add_column("Dataset", style="cyan")
    summary.add_column("Examples", justify="right")
    summary.add_column("Size", justify="right")
    summary.add_column("Time")

    completed = checkpoint.data["completed"]
    grand_count = sum(entry["count"] for entry in completed.values())
    grand_mb = sum(entry["size_mb"] for entry in completed.values())

    for name, entry in completed.items():
        summary.add_row(
            name,
            f"{entry['count']:,}",
            f"{entry['size_mb']:.1f} MB",
            entry.get("timestamp", ""),
        )

    summary.add_row("─" * 20, "─" * 10, "─" * 10, "─" * 15, style="dim")
    summary.add_row(
        "[bold]TOTAL[/]",
        f"[bold]{grand_count:,}[/]",
        f"[bold]{grand_mb:.1f} MB[/]",
        "",
        style="bold green",
    )

    console.print(summary)

    if grand_count >= 500_000:
        console.print("\n[bold green]πŸŽ‰ TARGET REACHED: 500K+ examples downloaded![/]")
    else:
        remaining = 500_000 - grand_count
        console.print(f"\n[yellow]⏳ {remaining:,} more examples needed to reach 500K target[/]")
873
+
874
+
875
# ── CLI ───────────────────────────────────────────────────────────────
def main() -> None:
    """Parse command-line flags and hand off to the pipeline."""
    parser = argparse.ArgumentParser(description="MINDI Dataset Download Pipeline")
    parser.add_argument("--dataset", type=str, help="Download a specific dataset by name")
    parser.add_argument("--stage", type=int, choices=[0, 1, 2, 3], help="Download a specific stage")
    parser.add_argument("--synthetic", action="store_true", help="Generate synthetic data only")
    opts = parser.parse_args()

    run_pipeline(
        stage=opts.stage,
        dataset_name=opts.dataset,
        synthetic_only=opts.synthetic,
    )
888
+
889
+
890
# Script entry point β€” only runs when executed directly, not on import.
if __name__ == "__main__":
    main()
scripts/process_data.py ADDED
@@ -0,0 +1,820 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder β€” Day 2 Step 2: MINDI Format Converter
3
+
4
+ Converts ALL raw datasets (JSONL) into unified MINDI training format.
5
+
6
+ Each output example:
7
+ {
8
+ "id": "mindi_000001",
9
+ "type": "code_generation",
10
+ "source": "websight",
11
+ "messages": [
12
+ {"role": "system", "content": "..."},
13
+ {"role": "user", "content": "..."},
14
+ {"role": "assistant", "content": "<|think_start|>...<|think_end|>..."}
15
+ ],
16
+ "metadata": {
17
+ "language": "typescript",
18
+ "framework": "nextjs",
19
+ "has_vision": false,
20
+ "tokens": 1024,
21
+ "quality_score": 8.5
22
+ }
23
+ }
24
+
25
+ Usage:
26
+ python scripts/process_data.py # Process all
27
+ python scripts/process_data.py --source codealpaca # Process one
28
+ python scripts/process_data.py --dry-run # Preview only
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import argparse
34
+ import hashlib
35
+ import json
36
+ import logging
37
+ import random
38
+ import re
39
+ import sys
40
+ import time
41
+ from dataclasses import dataclass
42
+ from pathlib import Path
43
+ from typing import Any, Generator, Optional
44
+
45
+ from rich.console import Console
46
+ from rich.logging import RichHandler
47
+ from rich.panel import Panel
48
+ from rich.progress import (
49
+ BarColumn,
50
+ MofNCompleteColumn,
51
+ Progress,
52
+ SpinnerColumn,
53
+ TextColumn,
54
+ TimeElapsedColumn,
55
+ TimeRemainingColumn,
56
+ )
57
+ from rich.table import Table
58
+
59
# ── Paths ─────────────────────────────────────────────────────────────
# Everything is derived from the repository root (two levels above this
# script) so the pipeline works regardless of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"  # downloaded JSONL sources
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"  # unified MINDI output
LOGS_DIR = PROJECT_ROOT / "logs"
TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

# Created eagerly at import time so later writes cannot fail on a missing dir.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# ── Logging ───────────────────────────────────────────────────────────
# Log records go both to the console (via rich) and to logs/process_data.log.
console = Console()
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(console=console, rich_tracebacks=True, show_path=False),
        logging.FileHandler(LOGS_DIR / "process_data.log", encoding="utf-8"),
    ],
)
log = logging.getLogger("mindi.process")

# ── System prompt ─────────────────────────────────────────────────────
# Injected verbatim as the "system" message of every converted example.
MINDI_SYSTEM_PROMPT = (
    "You are MINDI 1.5 Vision-Coder, an AI built by MINDIGENOUS.AI. "
    "You are an expert in Next.js 14, React, TypeScript, Tailwind CSS, "
    "and UI/UX design. You see your own output and critique it to make "
    "it better for the user."
)

# ── Tokenizer (lazy loaded) ──────────────────────────────────────────
# Module-level cache; populated on the first get_tokenizer() call.
_tokenizer = None
92
+
93
+
94
def get_tokenizer():
    """Return the shared MINDI tokenizer, loading it from disk on first use."""
    global _tokenizer
    if _tokenizer is not None:
        return _tokenizer
    # Deferred import: transformers is heavy and only needed here.
    from transformers import AutoTokenizer
    _tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_PATH), trust_remote_code=True)
    log.info(f"Loaded tokenizer (vocab={len(_tokenizer):,})")
    return _tokenizer
101
+
102
+
103
def count_tokens(text: str) -> int:
    """Number of tokenizer tokens in *text*, excluding special tokens."""
    return len(get_tokenizer().encode(text, add_special_tokens=False))
106
+
107
+
108
# ── Language detection ────────────────────────────────────────────────
def detect_language(code: str, filename: str = "") -> str:
    """Best-effort language detection from a filename extension or content heuristics.

    The extension (when a filename is given) wins; otherwise a series of
    substring heuristics is tried from most to least specific. Returns
    "unknown" when nothing matches.
    """
    by_extension = {
        ".py": "python", ".js": "javascript", ".jsx": "javascript",
        ".ts": "typescript", ".tsx": "typescript", ".html": "html",
        ".css": "css", ".json": "json", ".md": "markdown",
        ".rs": "rust", ".go": "go", ".java": "java", ".cpp": "cpp",
        ".c": "c", ".rb": "ruby", ".php": "php", ".swift": "swift",
        ".kt": "kotlin", ".sql": "sql", ".sh": "bash",
    }
    if filename:
        lang = by_extension.get(Path(filename).suffix.lower())
        if lang is not None:
            return lang

    # Content heuristics, most specific first.
    looks_like_react = (
        "import React" in code or "from 'react'" in code or "jsx" in code.lower()
    )
    if looks_like_react:
        has_types = ": " in code and ("interface " in code or "type " in code)
        return "typescript" if has_types else "javascript"
    if "def " in code and "import " in code and ":" in code:
        return "python"
    if "func " in code and "package " in code:
        return "go"
    if "fn " in code and "let mut" in code:
        return "rust"
    if "public class" in code or "public static void" in code:
        return "java"
    if "<!DOCTYPE" in code or "<html" in code:
        return "html"
    if "function " in code or "const " in code or "=>" in code:
        return "javascript"
    return "unknown"
140
+
141
+
142
def detect_framework(code: str) -> str:
    """Best-effort framework detection from content markers; "none" if unmatched."""
    lowered = code.lower()
    # Ordered checks: the first hit wins, mirroring specificity.
    checks = (
        ("nextjs", "'use client'" in code or "next/" in code or "Next" in code),
        ("react", "import React" in code or "from 'react'" in code),
        ("express", "express" in lowered),
        ("flask", "from flask" in code or "Flask(" in code),
        ("django", "from django" in code),
        ("vue", "import vue" in lowered or "defineComponent" in code),
    )
    for framework, matched in checks:
        if matched:
            return framework
    return "none"
157
+
158
+
159
# ── Quality scoring ──────────────────────────────────────────────────
def score_quality(code: str, language: str) -> float:
    """Heuristic code-quality score on a 1-10 scale.

    Starts from a 5.0 baseline and applies fixed bonuses/penalties for
    length, comments, type annotations, imports, error handling, module
    structure, and a few low-quality markers. The result is rounded to
    one decimal and clamped to [1.0, 10.0].
    """
    stripped = code.strip()
    line_count = len(stripped.splitlines())
    lowered = code.lower()

    # (condition, delta) pairs, applied in order.
    adjustments = (
        (line_count >= 10, 0.5),                       # non-trivial length
        (line_count >= 30, 0.5),
        (line_count < 3, -2.0),                        # near-empty snippet
        ("//" in code or "/*" in code or '"""' in code
         or "'''" in code or "#" in code, 0.5),        # has comments/docstrings
        (language in ("typescript", "python")
         and ":" in code
         and ("interface " in code or "type " in code or "-> " in code), 0.5),
        ("import " in code or "from " in code or "require(" in code, 0.3),
        ("try" in code or "catch" in code or "except" in code, 0.3),
        ("export " in code or "module.exports" in code, 0.3),
        (len(stripped) < 50, -1.0),                    # very short payload
        (code.count("TODO") > 3 or code.count("FIXME") > 3, -0.5),
        ("console.log" in code and code.count("console.log") > 5, -0.3),
        ("function " in code or "class " in code
         or "def " in code or "const " in code, 0.3),  # named structure
        ("className" in code or "tailwind" in lowered, 0.3),
    )

    score = 5.0
    for hit, delta in adjustments:
        if hit:
            score += delta
    return max(1.0, min(10.0, round(score, 1)))
213
+
214
+
215
# ── Converter: wrap code in MINDI format ─────────────────────────────
def wrap_mindi_assistant(
    code: str,
    language: str = "typescript",
    filename: str = "",
    thinking: str = "",
    critique: str = "",
    suggestions: str = "",
) -> str:
    """Assemble an assistant turn in MINDI special-token format.

    Sections are emitted in fixed order -- thinking, file metadata, code,
    critique, suggestions -- with empty optional sections omitted; only
    the code section is mandatory. Sections are joined by blank lines.

    Args:
        code: Source code to embed (stripped of surrounding whitespace).
        language: Language label written into the file-metadata section.
        filename: Target path; when non-empty, a <|file_start|> section is
            emitted containing this path and the detected framework.
        thinking: Optional reasoning text for the <|think_start|> section.
        critique: Optional review text for the <|critique_start|> section.
        suggestions: Optional text for the <|suggest_start|> section.

    Returns:
        The assembled assistant message string.
    """
    parts = []

    # Thinking block
    if thinking:
        parts.append(f"<|think_start|>\n{thinking}\n<|think_end|>")

    # File metadata
    if filename:
        framework = detect_framework(code)
        # Bug fix: the path line previously emitted a hard-coded placeholder
        # instead of the caller-supplied filename, so every example carried
        # a useless path. Emit the real filename.
        parts.append(
            f"<|file_start|>\npath: {filename}\nlanguage: {language}\nframework: {framework}\n<|file_end|>"
        )

    # Code block (always present)
    parts.append(f"<|code_start|>\n{code.strip()}\n<|code_end|>")

    # Critique
    if critique:
        parts.append(f"<|critique_start|>\n{critique}\n<|critique_end|>")

    # Suggestions
    if suggestions:
        parts.append(f"<|suggest_start|>\n{suggestions}\n<|suggest_end|>")

    return "\n\n".join(parts)
248
+
249
+
250
def generate_thinking(user_request: str, language: str) -> str:
    """Produce a generic four-step thinking block (action verb chosen at random).

    Note: *user_request* is accepted for interface symmetry with the other
    generators but does not influence the output.
    """
    verb = random.choice(["analyze", "implement", "create", "design", "build"])
    steps = (
        f"The user wants me to {verb} something. Let me break this down:",
        "1. Understand the requirements from the request",
        f"2. Choose the right approach for {language}",
        "3. Write clean, production-ready code",
        "4. Review for best practices and accessibility",
    )
    return "\n".join(steps)
261
+
262
+
263
def generate_critique(language: str, code: str) -> str:
    """Produce a deterministic bullet-list code review for the snippet."""
    findings = [
        "βœ… Code structure: Well-organized with clear separation of concerns",
        "βœ… Naming: Descriptive variable and function names",
    ]
    if language in ("typescript", "javascript"):
        findings.append("βœ… Modern syntax: Uses ES6+ features appropriately")
    if "className" in code:
        findings.append("βœ… Styling: Tailwind CSS classes used correctly")
    findings += [
        "⚠️ Consider adding error handling for edge cases",
        "⚠️ Could benefit from unit tests",
    ]
    bullets = "\n".join(f"- {finding}" for finding in findings)
    return f"Code Review:\n{bullets}"
276
+
277
+
278
def generate_suggestions() -> str:
    """Pick four random improvement tips and return them as a numbered list."""
    pool = [
        "Add comprehensive error handling with try/catch",
        "Implement loading and error states for better UX",
        "Add TypeScript strict mode compliance",
        "Write unit tests with Jest and Testing Library",
        "Add JSDoc comments for public API",
        "Consider extracting reusable hooks",
        "Add proper aria attributes for accessibility",
        "Implement responsive design breakpoints",
        "Add performance optimization with useMemo/useCallback",
        "Consider adding Storybook stories for documentation",
    ]
    picks = random.sample(pool, min(4, len(pool)))
    numbered = "\n".join(f"{rank + 1}. {tip}" for rank, tip in enumerate(picks))
    return f"Suggested improvements:\n{numbered}"
294
+
295
+
296
+ # ── Source-specific converters ────────────────────────────────────────
297
+
298
+ def convert_codealpaca(raw: dict, idx: int) -> Optional[dict]:
299
+ """Convert CodeAlpaca example to MINDI format."""
300
+ instruction = raw.get("instruction", "").strip()
301
+ inp = raw.get("input", "").strip()
302
+ output = raw.get("output", "").strip()
303
+
304
+ if not instruction or not output:
305
+ return None
306
+
307
+ user_content = f"{instruction}\n{inp}".strip() if inp else instruction
308
+ language = detect_language(output)
309
+ quality = score_quality(output, language)
310
+
311
+ assistant_content = wrap_mindi_assistant(
312
+ code=output,
313
+ language=language,
314
+ thinking=generate_thinking(instruction, language),
315
+ critique=generate_critique(language, output),
316
+ suggestions=generate_suggestions(),
317
+ )
318
+
319
+ tokens = count_tokens(assistant_content)
320
+
321
+ return {
322
+ "id": f"mindi_{idx:06d}",
323
+ "type": "code_generation",
324
+ "source": "codealpaca",
325
+ "messages": [
326
+ {"role": "system", "content": MINDI_SYSTEM_PROMPT},
327
+ {"role": "user", "content": user_content},
328
+ {"role": "assistant", "content": assistant_content},
329
+ ],
330
+ "metadata": {
331
+ "language": language,
332
+ "framework": detect_framework(output),
333
+ "has_vision": False,
334
+ "tokens": tokens,
335
+ "quality_score": quality,
336
+ },
337
+ }
338
+
339
+
340
def convert_codefeedback(raw: dict, idx: int) -> Optional[dict]:
    """Convert a CodeFeedback query/answer record to MINDI format.

    Returns None when the query or answer field is empty.
    """
    query = raw.get("query", "").strip()
    answer = raw.get("answer", "").strip()
    if not query or not answer:
        return None

    # Prefer fenced code blocks from the answer; fall back to the full text.
    fenced = re.findall(r"```[\w]*\n(.*?)```", answer, re.DOTALL)
    code = "\n\n".join(fenced) if fenced else answer

    language = detect_language(code)
    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(query, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "codefeedback",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": query},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(code, language),
        },
    }
382
+
383
+
384
def convert_starcoderdata(raw: dict, idx: int) -> Optional[dict]:
    """Convert a raw StarCoder source file into a synthetic instruction example.

    A user request is fabricated from the first named definition found in
    the code (def / class / function / const / export), and the original
    file becomes the assistant's answer.

    Args:
        raw: StarCoder record; only the "content" field is used. (The old
            code also read "max_line_length"/"avg_line_length" into locals
            that were never used -- those dead reads have been removed.)
        idx: Global example index used to build the "mindi_NNNNNN" id.

    Returns:
        A MINDI-format example dict, or None for empty/near-empty files.
    """
    content = raw.get("content", "").strip()
    # Skip trivially small files -- they make useless training examples.
    if not content or len(content) < 50:
        return None

    language = detect_language(content)
    quality = score_quality(content, language)
    user_request = _starcoder_user_request(content, language)

    # Give common languages a plausible filename so wrap_mindi_assistant
    # emits its file-metadata section.
    filename = {
        "python": "main.py",
        "typescript": "index.tsx",
        "javascript": "index.js",
    }.get(language, "")

    assistant_content = wrap_mindi_assistant(
        code=content,
        language=language,
        filename=filename,
        thinking=generate_thinking(user_request, language),
        critique=generate_critique(language, content),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "starcoderdata",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(content),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }


def _starcoder_user_request(content: str, language: str) -> str:
    """Synthesize a one-line user request from the code's first definition."""
    head = content[:500]
    if "def " in head:
        match = re.search(r"def (\w+)", head)
        name = match.group(1) if match else "function"
        return f"Write a {language} function called `{name}` with proper implementation"
    if "class " in head:
        match = re.search(r"class (\w+)", head)
        name = match.group(1) if match else "Class"
        return f"Create a {language} class called `{name}` with full implementation"
    if "function " in head or "const " in head:
        match = re.search(r"(?:function|const)\s+(\w+)", head)
        name = match.group(1) if match else "component"
        return f"Implement `{name}` in {language} with clean, modern code"
    if "export " in head:
        match = re.search(r"export\s+(?:default\s+)?(?:function|class|const)\s+(\w+)", head)
        name = match.group(1) if match else "module"
        return f"Build an exported {language} module `{name}`"
    return f"Write this {language} code with best practices"
456
+
457
+
458
def convert_websight(raw: dict, idx: int) -> Optional[dict]:
    """Convert a WebSight HTML page (plus optional screenshot) to MINDI format.

    The HTML is kept verbatim as the answer body -- producing the JSX
    conversion is the training objective, not a preprocessing step.
    Returns None when the record has no text.
    """
    html = raw.get("text", "").strip()
    if not html:
        return None

    quality = score_quality(html, "html")
    has_image = "image" in raw or "screenshot" in raw

    user_request = "Convert this webpage design into a modern Next.js 14 component with Tailwind CSS"
    thinking = (
        "The user wants me to convert a web design to Next.js. I need to:\n"
        "1. Analyze the HTML structure and visual layout\n"
        "2. Convert HTML elements to React JSX syntax\n"
        "3. Replace CSS classes with Tailwind CSS utilities\n"
        "4. Add TypeScript types and proper component structure\n"
        "5. Ensure responsive design and accessibility"
    )

    assistant_content = wrap_mindi_assistant(
        code=html,
        language="typescript",
        filename="src/components/ConvertedPage.tsx",
        thinking=thinking,
        critique=generate_critique("typescript", html),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "vision_code",
        "source": "websight",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": "typescript",
            "framework": "nextjs",
            "has_vision": has_image,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }
508
+
509
+
510
def convert_synthetic(raw: dict, idx: int) -> Optional[dict]:
    """Finalize a synthetic record that is already close to MINDI shape.

    Returns None when the user or assistant turn is empty.
    """
    user_content = raw.get("user", "").strip()
    assistant_content = raw.get("assistant", "").strip()
    if not user_content or not assistant_content:
        return None

    source = raw.get("source", "synthetic")
    language = raw.get("language", "typescript")
    # Search-flavoured synthetic sources get their own example type.
    example_type = "search" if "search" in source else "code_generation"

    return {
        "id": f"mindi_{idx:06d}",
        "type": example_type,
        "source": source,
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": raw.get("framework", "nextjs"),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(assistant_content, language),
        },
    }
539
+
540
+
541
def convert_evol_code(raw: dict, idx: int) -> Optional[dict]:
    """Convert an EvolInstruct-Code instruction/output record to MINDI format.

    Returns None when the instruction or output field is empty.
    """
    instruction = raw.get("instruction", "").strip()
    output = raw.get("output", "").strip()
    if not instruction or not output:
        return None

    # Prefer fenced code blocks from the output; fall back to the full text.
    fenced = re.findall(r"```[\w]*\n(.*?)```", output, re.DOTALL)
    code = "\n\n".join(fenced) if fenced else output

    language = detect_language(code)
    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(instruction, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "evol_code",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(code, language),
        },
    }
582
+
583
+
584
def convert_magicoder(raw: dict, idx: int) -> Optional[dict]:
    """Convert a Magicoder record to MINDI format.

    Magicoder ships either instruction/response or problem/solution field
    pairs; the first non-empty alternative wins. Returns None when either
    side is missing.
    """
    instruction = (raw.get("instruction", "") or raw.get("problem", "")).strip()
    output = (raw.get("response", "") or raw.get("solution", "")).strip()
    if not instruction or not output:
        return None

    # Prefer fenced code blocks from the answer; fall back to the full text.
    fenced = re.findall(r"```[\w]*\n(.*?)```", output, re.DOTALL)
    code = "\n\n".join(fenced) if fenced else output

    language = detect_language(code)
    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(instruction, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "magicoder",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(code, language),
        },
    }
626
+
627
+
628
# ── Source registry ───────────────────────────────────────────────────
# Maps each CLI source name to (raw JSONL filename under data/raw,
# converter function). All starcoder_* splits share one converter; the
# three synthetic sources share convert_synthetic.
SOURCE_CONVERTERS = {
    "codealpaca": ("codealpaca.jsonl", convert_codealpaca),
    "codefeedback": ("codefeedback.jsonl", convert_codefeedback),
    "starcoder_python": ("starcoder_python.jsonl", convert_starcoderdata),
    "starcoder_javascript": ("starcoder_javascript.jsonl", convert_starcoderdata),
    "starcoder_typescript": ("starcoder_typescript.jsonl", convert_starcoderdata),
    "starcoder_css": ("starcoder_css.jsonl", convert_starcoderdata),
    "starcoder_html": ("starcoder_html.jsonl", convert_starcoderdata),
    "evol_code": ("evol_code.jsonl", convert_evol_code),
    "magicoder": ("magicoder.jsonl", convert_magicoder),
    "websight": ("websight.jsonl", convert_websight),
    "synthetic_nextjs": ("synthetic_nextjs.jsonl", convert_synthetic),
    "search_examples": ("search_examples.jsonl", convert_synthetic),
    "sandbox_examples": ("sandbox_examples.jsonl", convert_synthetic),
}

# Single unified output file; converted examples from every source are
# appended here in processing order.
OUTPUT_FILE = DATA_PROCESSED / "mindi_all.jsonl"
646
+
647
+
648
# ── Main processing pipeline ─────────────────────────────────────────
def process_source(
    source_name: str,
    global_idx: int,
    progress: Progress,
    dry_run: bool = False,
) -> tuple[int, int, int]:
    """Convert one raw source file to MINDI format, appending to OUTPUT_FILE.

    Args:
        source_name: Key into SOURCE_CONVERTERS.
        global_idx: Next global example index (used for "mindi_NNNNNN" ids).
        progress: Shared rich Progress used for the per-source task bar.
        dry_run: When True, run converters but write nothing.

    Returns:
        (converted, skipped, updated_global_idx). Returns (0, 0, global_idx)
        unchanged for an unknown source or a missing input file.
    """
    if source_name not in SOURCE_CONVERTERS:
        log.error(f"Unknown source: {source_name}")
        return 0, 0, global_idx

    filename, converter = SOURCE_CONVERTERS[source_name]
    input_path = DATA_RAW / filename

    if not input_path.exists():
        log.warning(f"⏭️ Skipping {source_name}: {input_path} not found (download first)")
        return 0, 0, global_idx

    # Count lines for the progress bar. Bug fix: use a context manager so
    # the handle is closed deterministically (the old bare open() relied on
    # GC to close it).
    with open(input_path, encoding="utf-8") as f:
        total_lines = sum(1 for _ in f)
    task = progress.add_task(f"[cyan]{source_name}", total=total_lines)

    converted = 0
    skipped = 0
    output_handle = None

    if not dry_run:
        # Append mode so we can process sources incrementally.
        output_handle = open(OUTPUT_FILE, "a", encoding="utf-8")

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    progress.update(task, advance=1)
                    continue

                try:
                    raw = json.loads(line)
                except json.JSONDecodeError:
                    # Corrupt line -- count it and move on.
                    skipped += 1
                    progress.update(task, advance=1)
                    continue

                result = converter(raw, global_idx)

                if result is None:
                    skipped += 1
                else:
                    converted += 1
                    global_idx += 1
                    if output_handle is not None:
                        output_handle.write(json.dumps(result, ensure_ascii=False) + "\n")
                        # Bug fix: flush only after every 5000 *conversions*.
                        # The old check ran on every input line, so while
                        # converted was still 0 it flushed once per line.
                        if converted % 5000 == 0:
                            output_handle.flush()

                progress.update(task, advance=1)

    finally:
        if output_handle:
            output_handle.close()

    log.info(f"{'[DRY RUN] ' if dry_run else ''}βœ… {source_name}: {converted:,} converted, {skipped:,} skipped")
    return converted, skipped, global_idx
716
+
717
+
718
def run_processing(
    source: Optional[str] = None,
    dry_run: bool = False,
) -> None:
    """Run the MINDI conversion pipeline over one or all raw sources.

    Lists which raw files exist on disk, appends converted examples to
    OUTPUT_FILE (resuming IDs after any examples already present there),
    and prints a summary table plus 500K-target status at the end.

    Args:
        source: Process only this source (a SOURCE_CONVERTERS key);
            process every registered source when None.
        dry_run: Run all converters without writing any output.
    """
    console.print(Panel.fit(
        "[bold cyan]MINDI 1.5 Vision-Coder β€” MINDI Format Converter[/]\n"
        "[dim]Day 2 Step 2: Convert raw datasets to MINDI training format[/]",
        border_style="cyan",
    ))

    # Determine sources to process.
    sources = [source] if source else list(SOURCE_CONVERTERS.keys())

    # Show which raw input files are actually on disk.
    available_table = Table(title="πŸ“ Raw Data Files")
    available_table.add_column("Source", style="cyan")
    available_table.add_column("File")
    available_table.add_column("Exists")
    available_table.add_column("Size")

    for src in sources:
        fname, _ = SOURCE_CONVERTERS[src]
        fpath = DATA_RAW / fname
        exists = fpath.exists()
        size = f"{fpath.stat().st_size / (1024*1024):.1f} MB" if exists else "β€”"
        available_table.add_row(src, fname, "βœ…" if exists else "❌", size)

    console.print(available_table)

    # Resume support: new IDs continue after whatever is already in the
    # output file, since process_source appends.
    existing_count = 0
    if OUTPUT_FILE.exists():
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            existing_count = sum(1 for _ in f)
        log.info(f"πŸ“„ Existing mindi_all.jsonl has {existing_count:,} examples β€” appending new data")

    # Process each source.
    total_converted = 0
    total_skipped = 0
    global_idx = existing_count

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
        refresh_per_second=2,
    ) as progress:
        for src in sources:
            converted, skipped, global_idx = process_source(
                src, global_idx, progress, dry_run=dry_run
            )
            total_converted += converted
            total_skipped += skipped

    # Summary
    console.print()
    summary = Table(title="πŸ“Š Processing Summary")
    summary.add_column("Metric", style="cyan")
    summary.add_column("Value", justify="right", style="green")

    summary.add_row("Previously existing", f"{existing_count:,}")
    summary.add_row("Newly converted", f"{total_converted:,}")
    summary.add_row("Total skipped", f"{total_skipped:,}")
    grand_total = existing_count + total_converted
    summary.add_row("[bold]Grand total[/]", f"[bold]{grand_total:,}[/]")
    # Bug fix: when nothing has ever been converted, global_idx is 0 and the
    # old code printed the nonsense range "mindi_000000 β†’ mindi_-00001".
    if global_idx > 0:
        summary.add_row("Global ID range", f"mindi_000000 β†’ mindi_{global_idx - 1:06d}")
    else:
        summary.add_row("Global ID range", "β€”")

    if not dry_run and OUTPUT_FILE.exists():
        size_mb = OUTPUT_FILE.stat().st_size / (1024 * 1024)
        summary.add_row("Output file", str(OUTPUT_FILE.relative_to(PROJECT_ROOT)))
        summary.add_row("Output size", f"{size_mb:.1f} MB")

    console.print(summary)

    if grand_total >= 500_000:
        console.print("\n[bold green]πŸŽ‰ TARGET REACHED: 500K+ examples in MINDI format![/]")
    elif grand_total > 0:
        remaining = 500_000 - grand_total
        console.print(f"\n[yellow]⏳ {grand_total:,} total examples ({remaining:,} more needed for 500K target)[/]")
    else:
        console.print("\n[yellow]⚠️ No examples converted β€” download raw data first (scripts/download_datasets.py)[/]")
808
+
809
# ── CLI ───────────────────────────────────────────────────────────────
def main() -> None:
    """Parse CLI flags and launch the conversion pipeline."""
    cli = argparse.ArgumentParser(description="MINDI Format Converter")
    cli.add_argument("--source", type=str, help="Process a specific source only")
    cli.add_argument("--dry-run", action="store_true", help="Preview without writing output")
    opts = cli.parse_args()
    run_processing(source=opts.source, dry_run=opts.dry_run)


if __name__ == "__main__":
    main()