Faaz committed on
Commit
59c6c97
Β·
1 Parent(s): 11e0d89

Day 2 COMPLETE: 1.48M examples processed, 6GB dataset, WebSight done

Browse files
scripts/download_datasets.py ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder β€” Day 2 Step 1: Dataset Download Pipeline
3
+
4
+ Downloads 7 datasets (500K+ examples total) with:
5
+ - Rich progress bars
6
+ - Network retry with exponential backoff
7
+ - Checkpoint/resume support
8
+ - Disk space estimation
9
+ - Logging to logs/download.log
10
+ - Running total of examples
11
+
12
+ Usage:
13
+ python scripts/download_datasets.py # Download all
14
+ python scripts/download_datasets.py --dataset websight # Download one
15
+ python scripts/download_datasets.py --stage 1 # Stage 1 only (small/fast)
16
+ python scripts/download_datasets.py --stage 2 # Stage 2 (starcoder)
17
+ python scripts/download_datasets.py --stage 3 # Stage 3 (websight)
18
+ python scripts/download_datasets.py --synthetic # Synthetic only
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import hashlib
25
+ import json
26
+ import logging
27
+ import os
28
+ import random
29
+ import sys
30
+ import time
31
+ import traceback
32
+ from dataclasses import dataclass, field
33
+ from pathlib import Path
34
+ from typing import Any, Generator, Optional
35
+
36
+ from rich.console import Console
37
+ from rich.logging import RichHandler
38
+ from rich.panel import Panel
39
+ from rich.progress import (
40
+ BarColumn,
41
+ MofNCompleteColumn,
42
+ Progress,
43
+ SpinnerColumn,
44
+ TextColumn,
45
+ TimeElapsedColumn,
46
+ TimeRemainingColumn,
47
+ )
48
+ from rich.table import Table
49
+
50
# ── Project paths ─────────────────────────────────────────────────────
# Everything is resolved relative to the repository root (the parent of
# scripts/), so the script works from any current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"  # downloaded/generated JSONL files land here
LOGS_DIR = PROJECT_ROOT / "logs"  # plain-text run log lives here
CHECKPOINT_FILE = DATA_RAW / ".download_checkpoint.json"  # resume state

DATA_RAW.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# ── Logging ───────────────────────────────────────────────────────────
console = Console()

# Log to both the terminal (pretty, via Rich) and logs/download.log
# (plain text) so long overnight runs leave an auditable trail.
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(console=console, rich_tracebacks=True, show_path=False),
        logging.FileHandler(LOGS_DIR / "download.log", encoding="utf-8"),
    ],
)
log = logging.getLogger("mindi.download")
72
+
73
+
74
# ── Checkpoint manager ────────────────────────────────────────────────
class CheckpointManager:
    """Tracks which datasets are complete so downloads can resume.

    State is persisted as JSON at *path* with two top-level keys:
    ``completed`` (name -> {count, size_mb, timestamp}) and
    ``in_progress`` (name -> {count}).
    """

    def __init__(self, path: Path = CHECKPOINT_FILE) -> None:
        self.path = path
        self.data: dict[str, Any] = self._load()

    def _load(self) -> dict[str, Any]:
        """Load persisted state, falling back to a fresh state.

        A truncated or corrupt checkpoint file (e.g. from a crash mid
        ``save``) previously raised and aborted the whole run; now it is
        treated as "start over".  Both top-level keys are guaranteed to
        exist even if an older/partial file omits one.
        """
        fresh: dict[str, Any] = {"completed": {}, "in_progress": {}}
        if not self.path.exists():
            return fresh
        try:
            loaded = json.loads(self.path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            # Worst case we re-download finished datasets; never crash here.
            log.warning(f"Corrupt checkpoint at {self.path}; starting fresh")
            return fresh
        if not isinstance(loaded, dict):
            return fresh
        loaded.setdefault("completed", {})
        loaded.setdefault("in_progress", {})
        return loaded

    def save(self) -> None:
        """Persist the current state to disk as pretty-printed JSON."""
        self.path.write_text(json.dumps(self.data, indent=2), encoding="utf-8")

    def is_complete(self, name: str) -> bool:
        """Return True if dataset *name* has finished downloading."""
        return name in self.data["completed"]

    def mark_complete(self, name: str, count: int, size_mb: float) -> None:
        """Record *name* as done and drop any stale in-progress entry."""
        self.data["completed"][name] = {
            "count": count,
            "size_mb": round(size_mb, 2),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        self.data["in_progress"].pop(name, None)
        self.save()

    def mark_in_progress(self, name: str, count: int) -> None:
        """Checkpoint a partial download at *count* examples written."""
        self.data["in_progress"][name] = {"count": count}
        self.save()

    def get_resume_count(self, name: str) -> int:
        """Examples already written for *name* in a previous partial run."""
        return self.data.get("in_progress", {}).get(name, {}).get("count", 0)

    def get_total_examples(self) -> int:
        """Total example count across all completed datasets."""
        return sum(v["count"] for v in self.data["completed"].values())
111
+
112
+
113
# ── Dataset definitions ───────────────────────────────────────────────
@dataclass
class DatasetConfig:
    """Declarative description of one dataset the pipeline can produce."""

    name: str  # unique key used for CLI selection and checkpointing
    hf_name: str  # HuggingFace repo id ("" for synthetic sets)
    hf_subset: Optional[str]  # HF config name, e.g. "python" for starcoderdata
    hf_split: str  # split to stream, normally "train" ("" for synthetic)
    target_count: int  # stop streaming/generating after this many examples
    output_file: str  # JSONL filename written under data/raw/
    stage: int  # 1-3 = download stages by size; 0 = synthetic
    est_size_gb: float  # rough on-disk estimate, used by the disk-space check
    description: str  # human-readable summary shown in tables/logs
    languages: list[str] = field(default_factory=list)  # language tags (starcoder subsets)
    is_synthetic: bool = False  # generated locally instead of downloaded
127
+
128
+
129
# Registry of every dataset the pipeline knows about.  Stages group the
# downloads by expected wall-clock time; stage 0 entries are generated
# locally (is_synthetic=True) rather than downloaded.
DATASETS: list[DatasetConfig] = [
    # Stage 1 β€” Small/fast (5-10 min)
    DatasetConfig(
        name="codealpaca",
        hf_name="sahil2801/CodeAlpaca-20k",
        hf_subset=None,
        hf_split="train",
        target_count=20_000,
        output_file="codealpaca.jsonl",
        stage=1,
        est_size_gb=0.05,
        description="Code instruction-following pairs",
    ),
    DatasetConfig(
        name="codefeedback",
        hf_name="m-a-p/CodeFeedback-Filtered-Instruction",
        hf_subset=None,
        hf_split="train",
        target_count=50_000,
        output_file="codefeedback.jsonl",
        stage=1,
        est_size_gb=0.3,
        description="Code with human feedback",
    ),
    # Stage 2 β€” Medium (1-2 hours)
    DatasetConfig(
        name="starcoder_python",
        hf_name="bigcode/starcoderdata",
        hf_subset="python",
        hf_split="train",
        target_count=100_000,
        output_file="starcoderdata.jsonl",
        stage=2,
        est_size_gb=2.0,
        description="StarCoder Python code",
        languages=["python"],
    ),
    DatasetConfig(
        name="starcoder_javascript",
        hf_name="bigcode/starcoderdata",
        hf_subset="javascript",
        hf_split="train",
        target_count=100_000,
        output_file="starcoderdata.jsonl",  # appends to same file
        stage=2,
        est_size_gb=2.0,
        description="StarCoder JavaScript code",
        languages=["javascript"],
    ),
    DatasetConfig(
        name="starcoder_typescript",
        hf_name="bigcode/starcoderdata",
        hf_subset="typescript",
        hf_split="train",
        target_count=50_000,
        output_file="starcoderdata.jsonl",  # appends to same file
        stage=2,
        est_size_gb=1.0,
        description="StarCoder TypeScript code",
        languages=["typescript"],
    ),
    # Stage 3 β€” Large (overnight)
    DatasetConfig(
        name="websight",
        hf_name="HuggingFaceM4/WebSight",
        hf_subset="v0.2",
        hf_split="train",
        target_count=200_000,
        output_file="websight.jsonl",
        stage=3,
        est_size_gb=8.0,
        description="Screenshots + HTML code pairs",
    ),
    # Synthetic β€” No download needed
    DatasetConfig(
        name="synthetic_nextjs",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=30_000,
        output_file="synthetic_nextjs.jsonl",
        stage=0,
        est_size_gb=0.2,
        description="Synthetic Next.js components with MINDI format",
        is_synthetic=True,
    ),
    DatasetConfig(
        name="search_examples",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=5_000,
        output_file="search_examples.jsonl",
        stage=0,
        est_size_gb=0.03,
        description="MINDI search usage examples",
        is_synthetic=True,
    ),
    DatasetConfig(
        name="sandbox_examples",
        hf_name="",
        hf_subset=None,
        hf_split="",
        target_count=3_000,
        output_file="sandbox_examples.jsonl",
        stage=0,
        est_size_gb=0.02,
        description="MINDI sandbox error-fix examples",
        is_synthetic=True,
    ),
]
240
+
241
+
242
# ── Retry helper ──────────────────────────────────────────────────────
def retry_with_backoff(fn, max_retries: int = 5, base_delay: float = 2.0):
    """Call ``fn()`` and return its result, retrying on any exception.

    Waits ``base_delay * 2**attempt`` seconds between attempts plus a
    random jitter proportional to that delay, so concurrent workers do
    not retry in lockstep.  The final failure is re-raised unchanged.

    Args:
        fn: Zero-argument callable to invoke.
        max_retries: Total number of attempts; must be >= 1.
        base_delay: Initial backoff delay in seconds.

    Returns:
        Whatever ``fn()`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: The exception from the final failed attempt.
    """
    if max_retries < 1:
        # Previously this fell through and silently returned None,
        # masking a misconfiguration at the call site.
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt)
            # Jitter scaled to the delay: the old fixed uniform(0, 1)
            # second of jitter dwarfed small base delays.
            delay += random.uniform(0, delay)
            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s...")
            time.sleep(delay)
254
+
255
+
256
# ── HuggingFace download ─────────────────────────────────────────────
def download_hf_dataset(
    config: DatasetConfig,
    checkpoint: CheckpointManager,
    progress: Progress,
) -> int:
    """Download a HuggingFace dataset with streaming and save as JSONL.

    Streams ``config.hf_name`` (nothing fully materialised in memory)
    and writes each raw example as one JSON line to
    ``data/raw/<config.output_file>``, checkpointing every 5000 examples
    so an interrupted run can resume.

    Returns:
        Number of examples counted, including examples fast-forwarded
        past during resume (not including JSON-unserialisable skips).
    """
    # Imported lazily: `datasets` is heavy and unneeded for synthetic-only runs.
    from datasets import load_dataset

    output_path = DATA_RAW / config.output_file
    resume_count = checkpoint.get_resume_count(config.name)

    # For starcoder subsets that share an output file, use append mode
    # but only if this specific subset hasn't been completed
    is_append = config.output_file == "starcoderdata.jsonl" and output_path.exists()
    mode = "a" if is_append else "w"
    # NOTE(review): the first branch below restates the assignment above
    # and is redundant; kept byte-identical on purpose.
    if not is_append and resume_count == 0:
        mode = "w"
    elif resume_count > 0:
        mode = "a"
        log.info(f"Resuming {config.name} from example {resume_count:,}")

    # Progress bar starts at the resume offset so ETA stays meaningful.
    task = progress.add_task(
        f"[cyan]{config.name}",
        total=config.target_count,
        completed=resume_count,
    )

    log.info(f"Loading {config.hf_name} (subset={config.hf_subset}, split={config.hf_split}) streaming=True")

    def _load():
        # Built as kwargs so `name` is only passed when a subset is set.
        kwargs = {
            "path": config.hf_name,
            "split": config.hf_split,
            "streaming": True,
            "trust_remote_code": True,
        }
        if config.hf_subset:
            kwargs["name"] = config.hf_subset
        return load_dataset(**kwargs)

    ds = retry_with_backoff(_load)

    count = 0
    skipped = 0
    with open(output_path, mode, encoding="utf-8") as f:
        for example in ds:
            # Resume: re-stream from the beginning and fast-forward past
            # examples already written by a previous partial run.
            if count < resume_count:
                count += 1
                continue

            # Write raw example as JSONL
            try:
                line = json.dumps(example, ensure_ascii=False, default=str)
                f.write(line + "\n")
            except (TypeError, ValueError) as e:
                # Unserialisable example β€” drop it and keep streaming.
                skipped += 1
                continue

            count += 1
            progress.update(task, completed=count)

            # Periodic checkpoint every 5000 examples
            if count % 5000 == 0:
                checkpoint.mark_in_progress(config.name, count)
                f.flush()

            if count >= config.target_count:
                break

    size_mb = output_path.stat().st_size / (1024 * 1024)
    log.info(f"βœ… {config.name}: {count:,} examples, {size_mb:.1f} MB (skipped {skipped})")
    progress.update(task, completed=count)
    return count
330
+
331
+
332
# ── Synthetic generators ──────────────────────────────────────────────

# Component templates for synthetic Next.js data
COMPONENT_TYPES = [
    "Navbar", "Hero", "Footer", "Sidebar", "Card", "Modal", "Dropdown",
    "Accordion", "Tabs", "Carousel", "Pagination", "Breadcrumb", "Alert",
    "Toast", "Badge", "Avatar", "Tooltip", "Popover", "Progress", "Spinner",
    "Skeleton", "Table", "Form", "Input", "Select", "Checkbox", "Radio",
    "Switch", "Slider", "DatePicker", "FileUpload", "SearchBar", "CommandPalette",
    "DataTable", "Chart", "Calendar", "Timeline", "Stepper", "Rating",
    "PricingCard", "TestimonialCard", "FeatureGrid", "StatsSection",
    "CTASection", "Newsletter", "LoginForm", "SignupForm", "ProfileCard",
    "DashboardLayout", "SettingsPanel", "NotificationList", "ChatBubble",
]

# Standard Tailwind palette names used to vary generated colour schemes.
TAILWIND_COLORS = [
    "slate", "gray", "zinc", "neutral", "stone", "red", "orange", "amber",
    "yellow", "lime", "green", "emerald", "teal", "cyan", "sky", "blue",
    "indigo", "violet", "purple", "fuchsia", "pink", "rose",
]

# Design/UX techniques mixed into prompts and critiques for variety.
DESIGN_PATTERNS = [
    "responsive grid layout", "flexbox centering", "gradient background",
    "glassmorphism effect", "dark mode support", "animated entrance",
    "hover transitions", "skeleton loading state", "error boundary",
    "lazy loading", "infinite scroll", "drag and drop", "keyboard navigation",
    "focus management", "scroll animations", "parallax effect",
]

# User-prompt templates; {component}/{color}/{pattern} are substituted.
USER_REQUESTS = [
    "Build me a {component} component with {pattern}",
    "Create a modern {component} using Tailwind CSS with {color} theme",
    "I need a {component} that supports dark mode and is fully accessible",
    "Design a {component} with smooth animations and {pattern}",
    "Make a responsive {component} component for a SaaS dashboard",
    "Build a {component} with TypeScript and proper prop types",
    "Create a reusable {component} with {pattern} for a landing page",
    "I want a {component} that looks like the latest {color} design trend",
    "Generate a production-ready {component} with {pattern}",
    "Build a {component} component with Framer Motion animations",
]

# Assistant self-critique templates; {pattern}/{color} are substituted.
CRITIQUE_TEMPLATES = [
    "Visual Analysis:\n- βœ… Layout: Clean {pattern} implementation\n- βœ… Typography: Proper hierarchy with {color} accent colors\n- ⚠️ Accessibility: Consider adding aria-labels to interactive elements\n- βœ… Responsiveness: Works across breakpoints",
    "Design Review:\n- βœ… Color scheme: {color} palette creates good visual harmony\n- βœ… Spacing: Consistent padding and margins\n- ⚠️ Touch targets: Buttons should be at least 44px for mobile\n- βœ… Visual hierarchy: Clear flow from header to content",
    "UI/UX Assessment:\n- βœ… {pattern}: Well implemented with smooth transitions\n- βœ… Contrast: Text is readable against background\n- ⚠️ Loading state: Consider adding skeleton screens\n- βœ… Component structure: Clean separation of concerns",
]

# Ready-made improvement suggestions (used verbatim, no substitution).
SUGGEST_TEMPLATES = [
    "Improvements for next iteration:\n1. Add aria-label attributes for screen readers\n2. Implement keyboard navigation (Tab, Enter, Escape)\n3. Add loading skeleton state\n4. Consider adding subtle micro-interactions on hover",
    "Suggestions:\n1. Add error boundary wrapper for production safety\n2. Implement responsive breakpoints for sm/md/lg/xl\n3. Add unit tests with @testing-library/react\n4. Consider extracting reusable hooks for state logic",
    "Next steps:\n1. Add dark mode toggle using next-themes\n2. Optimize images with next/image component\n3. Add Storybook stories for documentation\n4. Implement proper TypeScript discriminated unions for variants",
]
385
+
386
+
387
def _generate_code_block(component: str, color: str) -> str:
    """Generate a realistic Next.js component code block.

    Returns TSX source for a clickable, keyboard-accessible component
    named *component*, themed with Tailwind *color* classes and a
    randomly chosen variant.
    """
    props_name = f"{component}Props"
    variants = ["default", "primary", "secondary", "outline", "ghost"]
    variant = random.choice(variants)

    # Doubled braces ({{ }}) in the template are literal braces in the
    # emitted TSX; single braces interpolate component/color/variant.
    code = f"""'use client';

import {{ useState }} from 'react';
import {{ cn }} from '@/lib/utils';

interface {props_name} {{
  variant?: '{variant}' | 'default';
  className?: string;
  children?: React.ReactNode;
}}

export default function {component}({{ variant = 'default', className, children }}: {props_name}) {{
  const [isActive, setIsActive] = useState(false);

  return (
    <div
      className={{cn(
        'rounded-lg border p-4 transition-all duration-200',
        variant === '{variant}' && 'bg-{color}-50 border-{color}-200 text-{color}-900',
        variant === 'default' && 'bg-white border-gray-200 text-gray-900',
        isActive && 'ring-2 ring-{color}-500 shadow-lg',
        className
      )}}
      onClick={{() => setIsActive(!isActive)}}
      role="button"
      tabIndex={{0}}
      onKeyDown={{(e) => e.key === 'Enter' && setIsActive(!isActive)}}
    >
      <div className="flex items-center justify-between">
        <h3 className="text-lg font-semibold">{component}</h3>
        <span className="text-sm text-{color}-600">{{variant}}</span>
      </div>
      <div className="mt-2 text-sm text-gray-600">
        {{children}}
      </div>
    </div>
  );
}}"""
    return code
432
+
433
+
434
def generate_synthetic_nextjs(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Generate synthetic Next.js training examples in MINDI format.

    Each yielded dict carries ``user`` (the prompt) and ``assistant``
    (a full MINDI response with think/file/code/critique/suggest
    sections) plus metadata fields for downstream filtering.

    Args:
        count: Number of examples to yield.
        progress: Shared Rich progress renderer.
    """
    task = progress.add_task("[magenta]synthetic_nextjs", total=count)

    for i in range(count):
        component = random.choice(COMPONENT_TYPES)
        color = random.choice(TAILWIND_COLORS)
        pattern = random.choice(DESIGN_PATTERNS)

        request_template = random.choice(USER_REQUESTS)
        user_request = request_template.format(
            component=component, color=color, pattern=pattern
        )

        code = _generate_code_block(component, color)
        filename = f"src/components/{component}.tsx"

        thinking = (
            f"The user wants a {component} component. I need to:\n"
            f"1. Create a TypeScript component with proper prop types\n"
            f"2. Use Tailwind CSS with {color} color scheme\n"
            f"3. Implement {pattern}\n"
            f"4. Ensure accessibility with ARIA attributes\n"
            f"5. Add keyboard navigation support"
        )

        critique = random.choice(CRITIQUE_TEMPLATES).format(
            pattern=pattern, color=color
        )
        suggestions = random.choice(SUGGEST_TEMPLATES)

        assistant_content = (
            f"<|think_start|>\n{thinking}\n<|think_end|>\n\n"
            # Fix: emit the generated component path instead of the
            # "(unknown)" placeholder; `filename` was computed but unused.
            f"<|file_start|>\npath: {filename}\nlanguage: typescript\nframework: next.js 14\n<|file_end|>\n\n"
            f"<|code_start|>\n{code}\n<|code_end|>\n\n"
            f"<|critique_start|>\n{critique}\n<|critique_end|>\n\n"
            f"<|suggest_start|>\n{suggestions}\n<|suggest_end|>"
        )

        yield {
            "id": f"synthetic_{i:06d}",
            "source": "synthetic_nextjs",
            "user": user_request,
            "assistant": assistant_content,
            "component": component,
            "language": "typescript",
            "framework": "nextjs",
        }

        progress.update(task, completed=i + 1)
484
+
485
+
486
def generate_search_examples(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Yield synthetic examples demonstrating MINDI's search-tool usage.

    Cycles deterministically through a fixed set of question/query pairs
    and package recommendations, producing one dict per example.
    """
    bar = progress.add_task("[yellow]search_examples", total=count)

    # (user question, search query) pairs, cycled in order.
    search_scenarios = [
        ("How to implement dark mode in Next.js 14?", "next.js 14 dark mode implementation next-themes"),
        ("Best practices for React form validation", "react form validation zod react-hook-form 2025"),
        ("How to set up authentication in Next.js?", "next.js 14 authentication NextAuth.js credentials"),
        ("Tailwind CSS animation examples", "tailwind css animation keyframes framer-motion"),
        ("How to optimize images in Next.js?", "next.js image optimization next/image blur placeholder"),
        ("React server components best practices", "react server components RSC data fetching patterns"),
        ("How to deploy Next.js to Vercel?", "next.js 14 vercel deployment environment variables"),
        ("TypeScript utility types for React", "typescript react utility types ComponentProps PropsWithChildren"),
        ("How to use Zustand for state management?", "zustand state management react next.js middleware"),
        ("CSS Grid vs Flexbox for layouts", "css grid flexbox responsive layout patterns 2025"),
        ("How to implement infinite scroll?", "react infinite scroll intersection observer tanstack query"),
        ("Next.js API routes best practices", "next.js 14 route handlers API validation zod"),
        ("How to add SEO to Next.js?", "next.js 14 metadata SEO generateMetadata open graph"),
        ("React testing best practices", "react testing library jest vitest component testing"),
        ("How to use Prisma with Next.js?", "prisma next.js 14 database postgresql schema"),
    ]

    # (package name, blurb, install command) triples, cycled in order.
    packages_db = [
        ("framer-motion", "Production-ready motion library for React", "npm i framer-motion"),
        ("next-themes", "Dark mode for Next.js apps", "npm i next-themes"),
        ("zustand", "Small, fast state management", "npm i zustand"),
        ("@tanstack/react-query", "Powerful data synchronization", "npm i @tanstack/react-query"),
        ("react-hook-form", "Performant forms with validation", "npm i react-hook-form"),
        ("zod", "TypeScript-first schema validation", "npm i zod"),
        ("tailwind-merge", "Merge Tailwind classes without conflicts", "npm i tailwind-merge"),
        ("clsx", "Tiny utility for constructing className strings", "npm i clsx"),
        ("lucide-react", "Beautiful SVG icons for React", "npm i lucide-react"),
        ("@radix-ui/react-dialog", "Accessible dialog component", "npm i @radix-ui/react-dialog"),
    ]

    for idx in range(count):
        user_q, search_query = search_scenarios[idx % len(search_scenarios)]
        pkg_name, pkg_desc, pkg_install = packages_db[idx % len(packages_db)]

        assistant_content = (
            f"<|think_start|>\nThe user is asking about {user_q.lower().rstrip('?')}. "
            f"Let me search for the latest best practices.\n<|think_end|>\n\n"
            f"<|search_start|>\nquery: \"{search_query}\"\n"
            f"results: [\n"
            f"  {{\"title\": \"Official Documentation\", \"url\": \"https://docs.example.com\", \"snippet\": \"Comprehensive guide...\"}},\n"
            f"  {{\"title\": \"Best Practices 2025\", \"url\": \"https://blog.example.com\", \"snippet\": \"Updated approach...\"}}\n"
            f"]\n<|search_end|>\n\n"
            f"Based on my research, here's the recommended approach:\n\n"
            f"First, install the required package:\n```bash\n{pkg_install}\n```\n\n"
            f"**{pkg_name}** β€” {pkg_desc}\n\n"
            f"<|code_start|>\n"
            f"// Example usage of {pkg_name}\n"
            f"import {{ /* relevant imports */ }} from '{pkg_name}';\n\n"
            f"export default function Example() {{\n"
            f"  // Implementation based on search results\n"
            f"  return <div>Example using {pkg_name}</div>;\n"
            f"}}\n"
            f"<|code_end|>"
        )

        yield {
            "id": f"search_{idx:06d}",
            "source": "search_examples",
            "user": user_q,
            "assistant": assistant_content,
            "search_query": search_query,
        }

        progress.update(bar, completed=idx + 1)
557
+
558
+
559
def generate_sandbox_examples(count: int, progress: Progress) -> Generator[dict, None, None]:
    """Generate synthetic sandbox error-fix examples.

    Cycles through a fixed catalogue of common Next.js/React build and
    runtime errors, yielding one MINDI-formatted dict per example:
    think -> error -> failed sandbox run -> fix -> file -> code ->
    successful sandbox run.
    """
    task = progress.add_task("[red]sandbox_examples", total=count)

    # Each scenario: observed error, diagnosed root cause, applied fix,
    # and the file the fix lands in.
    error_scenarios = [
        {
            "error": "TypeError: Cannot read properties of undefined (reading 'map')",
            "cause": "Data array is undefined on initial render before API response",
            "fix": "Add optional chaining and fallback: data?.items?.map(...) ?? []",
            "file": "src/components/DataList.tsx",
        },
        {
            "error": "Error: Hydration failed because the initial UI does not match what was rendered on the server",
            "cause": "Using browser-only APIs (window, localStorage) during server render",
            "fix": "Wrap in useEffect or use dynamic import with ssr: false",
            "file": "src/components/ThemeProvider.tsx",
        },
        {
            "error": "Module not found: Can't resolve '@/components/ui/button'",
            "cause": "Path alias not configured in tsconfig.json",
            "fix": "Add paths mapping in tsconfig.json: '@/*': ['./src/*']",
            "file": "tsconfig.json",
        },
        {
            "error": "Warning: Each child in a list should have a unique 'key' prop",
            "cause": "Missing key prop in .map() iteration",
            "fix": "Add key={item.id} to the mapped JSX element",
            "file": "src/components/ItemList.tsx",
        },
        {
            "error": "TypeError: fetch failed - ECONNREFUSED",
            "cause": "API endpoint is unreachable or CORS is not configured",
            "fix": "Use Next.js API route as proxy, add CORS headers",
            "file": "src/app/api/proxy/route.ts",
        },
        {
            "error": "Error: Invalid hook call. Hooks can only be called inside of the body of a function component",
            "cause": "Calling useState inside a conditional or nested function",
            "fix": "Move hook call to the top level of the component function",
            "file": "src/hooks/useAuth.ts",
        },
        {
            "error": "Build error: Type 'string | undefined' is not assignable to type 'string'",
            "cause": "Environment variable might be undefined at build time",
            "fix": "Add non-null assertion or provide default value with ?? ''",
            "file": "src/lib/config.ts",
        },
        {
            "error": "Warning: validateDOMNesting(...): <div> cannot appear as a descendant of <p>",
            "cause": "Invalid HTML nesting - block element inside inline element",
            "fix": "Change outer <p> to <div> or inner <div> to <span>",
            "file": "src/components/Card.tsx",
        },
        {
            "error": "Error: NEXT_REDIRECT in API route",
            "cause": "Using redirect() in a try/catch block catches the redirect error",
            "fix": "Move redirect() call outside of try/catch, or re-throw NEXT_REDIRECT",
            "file": "src/app/api/auth/route.ts",
        },
        {
            "error": "Unhandled Runtime Error: Maximum update depth exceeded",
            "cause": "useEffect dependency causes infinite re-render loop",
            "fix": "Memoize the dependency with useMemo or useCallback",
            "file": "src/hooks/useData.ts",
        },
    ]

    for i in range(count):
        # Cycle through scenarios so any count produces valid examples.
        scenario = error_scenarios[i % len(error_scenarios)]

        assistant_content = (
            f"<|think_start|>\n"
            f"I see a build error. Let me analyze:\n"
            f"Error: {scenario['error']}\n"
            f"Root cause: {scenario['cause']}\n"
            f"I need to fix this in {scenario['file']}\n"
            f"<|think_end|>\n\n"
            f"<|error_start|>\n"
            f"File: {scenario['file']}\n"
            f"Error: {scenario['error']}\n"
            f"<|error_end|>\n\n"
            f"<|sandbox_start|>\n"
            f"Running: npm run build\n"
            f"Status: FAILED\n"
            f"Exit code: 1\n"
            f"<|sandbox_end|>\n\n"
            f"<|fix_start|>\n"
            f"Root cause: {scenario['cause']}\n"
            f"Solution: {scenario['fix']}\n"
            f"<|fix_end|>\n\n"
            f"<|file_start|>\npath: {scenario['file']}\nlanguage: typescript\n<|file_end|>\n\n"
            f"<|code_start|>\n"
            f"// Fixed version of {scenario['file']}\n"
            f"// Applied fix: {scenario['fix']}\n"
            f"export default function Fixed() {{\n"
            f"  // Corrected implementation\n"
            f"  return <div>Fixed component</div>;\n"
            f"}}\n"
            f"<|code_end|>\n\n"
            f"<|sandbox_start|>\n"
            f"Running: npm run build\n"
            f"Status: SUCCESS\n"
            f"Exit code: 0\n"
            f"<|sandbox_end|>"
        )

        yield {
            "id": f"sandbox_{i:06d}",
            "source": "sandbox_examples",
            "user": f"I'm getting this error: {scenario['error']}",
            "assistant": assistant_content,
            # Truncated error text, used as a cheap grouping key downstream.
            "error_type": scenario["error"][:50],
        }

        progress.update(task, completed=i + 1)
674
+
675
+
676
def write_synthetic(
    config: DatasetConfig,
    checkpoint: CheckpointManager,
    progress: Progress,
) -> int:
    """Generate and write synthetic data to ``data/raw/<output_file>``.

    Args:
        config: Which synthetic dataset to produce; ``config.name`` must
            match a registered generator.
        checkpoint: Accepted for signature parity with
            ``download_hf_dataset``; synthetic generation is fast enough
            that it always restarts from scratch rather than resuming.
        progress: Shared Rich progress renderer.

    Returns:
        Number of examples written.

    Raises:
        ValueError: If ``config.name`` has no registered generator.
    """
    output_path = DATA_RAW / config.output_file

    generators = {
        "synthetic_nextjs": generate_synthetic_nextjs,
        "search_examples": generate_search_examples,
        "sandbox_examples": generate_sandbox_examples,
    }

    # Fail with a clear message instead of a bare KeyError when a new
    # synthetic DatasetConfig is added without a matching generator.
    gen_fn = generators.get(config.name)
    if gen_fn is None:
        raise ValueError(
            f"No synthetic generator registered for {config.name!r}; "
            f"known: {sorted(generators)}"
        )

    count = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for example in gen_fn(config.target_count, progress):
            f.write(json.dumps(example, ensure_ascii=False) + "\n")
            count += 1

    size_mb = output_path.stat().st_size / (1024 * 1024)
    log.info(f"βœ… {config.name}: {count:,} examples, {size_mb:.1f} MB")
    return count
701
+
702
+
703
# ── Disk space check ──────────────────────────────────────────────────
def check_disk_space(datasets: list[DatasetConfig]) -> bool:
    """Render a disk-space estimate table and refuse clearly unsafe runs.

    Returns True when the combined estimated size of *datasets* fits
    comfortably (under 80% of free space) on the data volume.
    """
    import shutil

    needed_gb = sum(cfg.est_size_gb for cfg in datasets)
    free_gb = shutil.disk_usage(str(DATA_RAW)).free / (1024 ** 3)

    table = Table(title="πŸ’Ύ Disk Space Estimate")
    table.add_column("Item", style="cyan")
    table.add_column("Size", justify="right", style="green")

    for cfg in datasets:
        table.add_row(cfg.name, f"{cfg.est_size_gb:.2f} GB")

    leftover = free_gb - needed_gb
    table.add_row("─" * 20, "─" * 10, style="dim")
    table.add_row("Total estimated", f"{needed_gb:.2f} GB", style="bold")
    table.add_row("Available", f"{free_gb:.1f} GB", style="bold green")
    table.add_row(
        "After download",
        f"~{leftover:.1f} GB",
        style="bold yellow" if leftover > 50 else "bold red",
    )

    console.print(table)

    # Keep a 20% safety margin over the (rough) estimates.
    if needed_gb > free_gb * 0.8:
        log.error(f"Not enough disk space! Need {needed_gb:.1f} GB, have {free_gb:.1f} GB")
        return False

    return True
735
+
736
+
737
# ── Main pipeline ─────────────────────────────────────────────────────
def run_pipeline(
    stage: Optional[int] = None,
    dataset_name: Optional[str] = None,
    synthetic_only: bool = False,
) -> None:
    """Run the download pipeline.

    Selection precedence: an explicit ``dataset_name`` wins over
    ``synthetic_only``, which wins over ``stage``; with no filters every
    entry in DATASETS is processed.  Datasets already recorded as
    complete in the checkpoint file are skipped, and a failure in one
    dataset does not stop the remaining ones.
    """
    console.print(Panel.fit(
        "[bold cyan]MINDI 1.5 Vision-Coder β€” Dataset Download Pipeline[/]\n"
        "[dim]Day 2 Step 1: Download 500K+ training examples[/]",
        border_style="cyan",
    ))

    checkpoint = CheckpointManager()

    # Filter datasets based on args
    if dataset_name:
        targets = [d for d in DATASETS if d.name == dataset_name]
        if not targets:
            log.error(f"Unknown dataset: {dataset_name}. Available: {[d.name for d in DATASETS]}")
            return
    elif synthetic_only:
        targets = [d for d in DATASETS if d.is_synthetic]
    elif stage is not None:
        # Stage 0 doubles as "all synthetic" since synthetic configs use stage=0.
        targets = [d for d in DATASETS if d.stage == stage or (stage == 0 and d.is_synthetic)]
    else:
        targets = DATASETS

    # Show plan
    plan_table = Table(title="πŸ“‹ Download Plan")
    plan_table.add_column("Dataset", style="cyan")
    plan_table.add_column("Examples", justify="right")
    plan_table.add_column("Est. Size", justify="right")
    plan_table.add_column("Stage")
    plan_table.add_column("Status")

    for d in targets:
        status = "βœ… Done" if checkpoint.is_complete(d.name) else "⏳ Pending"
        stage_label = f"Stage {d.stage}" if d.stage > 0 else "Synthetic"
        plan_table.add_row(
            d.name,
            f"{d.target_count:,}",
            f"{d.est_size_gb:.2f} GB",
            stage_label,
            status,
        )

    console.print(plan_table)

    # Check disk space
    pending = [d for d in targets if not checkpoint.is_complete(d.name)]
    if not pending:
        console.print("\n[bold green]βœ… All requested datasets already downloaded![/]")
        _print_summary(checkpoint)
        return

    if not check_disk_space(pending):
        return

    # Download with progress
    console.print()
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
        refresh_per_second=2,
    ) as progress:
        for config in pending:
            # Defensive re-check; `pending` was already filtered above.
            if checkpoint.is_complete(config.name):
                log.info(f"Skipping {config.name} (already complete)")
                continue

            log.info(f"\n{'─' * 50}")
            log.info(f"Starting: {config.name} β€” {config.description}")

            try:
                if config.is_synthetic:
                    count = write_synthetic(config, checkpoint, progress)
                else:
                    count = download_hf_dataset(config, checkpoint, progress)

                size_mb = (DATA_RAW / config.output_file).stat().st_size / (1024 * 1024)
                checkpoint.mark_complete(config.name, count, size_mb)

            except KeyboardInterrupt:
                # In-progress checkpoints were saved along the way; a
                # rerun resumes from the last checkpoint.
                log.warning(f"\n⚠️ Interrupted during {config.name}. Progress saved β€” rerun to resume.")
                return
            except Exception as e:
                # One bad dataset shouldn't abort the overnight batch.
                log.error(f"❌ Failed {config.name}: {e}")
                log.error(traceback.format_exc())
                continue

    _print_summary(checkpoint)
834
+
835
+
836
def _print_summary(checkpoint: CheckpointManager) -> None:
    """Render a final table of completed datasets plus grand totals."""
    console.print()
    summary = Table(title="πŸ“Š Download Summary")
    summary.add_column("Dataset", style="cyan")
    summary.add_column("Examples", justify="right")
    summary.add_column("Size", justify="right")
    summary.add_column("Time")

    completed = checkpoint.data["completed"]
    grand_count = sum(entry["count"] for entry in completed.values())
    grand_mb = sum(entry["size_mb"] for entry in completed.values())

    for name, entry in completed.items():
        summary.add_row(
            name,
            f"{entry['count']:,}",
            f"{entry['size_mb']:.1f} MB",
            entry.get("timestamp", ""),
        )

    summary.add_row("─" * 20, "─" * 10, "─" * 10, "─" * 15, style="dim")
    summary.add_row(
        "[bold]TOTAL[/]",
        f"[bold]{grand_count:,}[/]",
        f"[bold]{grand_mb:.1f} MB[/]",
        "",
        style="bold green",
    )

    console.print(summary)

    if grand_count >= 500_000:
        console.print("\n[bold green]πŸŽ‰ TARGET REACHED: 500K+ examples downloaded![/]")
    else:
        remaining = 500_000 - grand_count
        console.print(f"\n[yellow]⏳ {remaining:,} more examples needed to reach 500K target[/]")
873
+
874
+
875
# ── CLI ───────────────────────────────────────────────────────────────
def main() -> None:
    """Parse command-line flags and hand off to the pipeline."""
    parser = argparse.ArgumentParser(description="MINDI Dataset Download Pipeline")
    parser.add_argument("--dataset", type=str, help="Download a specific dataset by name")
    parser.add_argument("--stage", type=int, choices=[0, 1, 2, 3], help="Download a specific stage")
    parser.add_argument("--synthetic", action="store_true", help="Generate synthetic data only")
    opts = parser.parse_args()

    run_pipeline(
        stage=opts.stage,
        dataset_name=opts.dataset,
        synthetic_only=opts.synthetic,
    )
888
+
889
+
890
# Script entry point β€” only runs when executed directly, not on import.
if __name__ == "__main__":
    main()
scripts/process_data.py ADDED
@@ -0,0 +1,820 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder β€” Day 2 Step 2: MINDI Format Converter
3
+
4
+ Converts ALL raw datasets (JSONL) into unified MINDI training format.
5
+
6
+ Each output example:
7
+ {
8
+ "id": "mindi_000001",
9
+ "type": "code_generation",
10
+ "source": "websight",
11
+ "messages": [
12
+ {"role": "system", "content": "..."},
13
+ {"role": "user", "content": "..."},
14
+ {"role": "assistant", "content": "<|think_start|>...<|think_end|>..."}
15
+ ],
16
+ "metadata": {
17
+ "language": "typescript",
18
+ "framework": "nextjs",
19
+ "has_vision": false,
20
+ "tokens": 1024,
21
+ "quality_score": 8.5
22
+ }
23
+ }
24
+
25
+ Usage:
26
+ python scripts/process_data.py # Process all
27
+ python scripts/process_data.py --source codealpaca # Process one
28
+ python scripts/process_data.py --dry-run # Preview only
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import argparse
34
+ import hashlib
35
+ import json
36
+ import logging
37
+ import random
38
+ import re
39
+ import sys
40
+ import time
41
+ from dataclasses import dataclass
42
+ from pathlib import Path
43
+ from typing import Any, Generator, Optional
44
+
45
+ from rich.console import Console
46
+ from rich.logging import RichHandler
47
+ from rich.panel import Panel
48
+ from rich.progress import (
49
+ BarColumn,
50
+ MofNCompleteColumn,
51
+ Progress,
52
+ SpinnerColumn,
53
+ TextColumn,
54
+ TimeElapsedColumn,
55
+ TimeRemainingColumn,
56
+ )
57
+ from rich.table import Table
58
+
59
# ── Paths ─────────────────────────────────────────────────────────────
# Everything is derived from the repository root (two levels above this
# script) so the pipeline works regardless of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"  # downloaded JSONL sources
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"  # unified MINDI output
LOGS_DIR = PROJECT_ROOT / "logs"
TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

# Created eagerly at import time so later writes cannot fail on a missing dir.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# ── Logging ───────────────────────────────────────────────────────────
# Log records go both to the console (via rich) and to logs/process_data.log.
console = Console()
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(console=console, rich_tracebacks=True, show_path=False),
        logging.FileHandler(LOGS_DIR / "process_data.log", encoding="utf-8"),
    ],
)
log = logging.getLogger("mindi.process")

# ── System prompt ─────────────────────────────────────────────────────
# Injected verbatim as the "system" message of every converted example.
MINDI_SYSTEM_PROMPT = (
    "You are MINDI 1.5 Vision-Coder, an AI built by MINDIGENOUS.AI. "
    "You are an expert in Next.js 14, React, TypeScript, Tailwind CSS, "
    "and UI/UX design. You see your own output and critique it to make "
    "it better for the user."
)

# ── Tokenizer (lazy loaded) ──────────────────────────────────────────
# Module-level cache; populated on the first get_tokenizer() call.
_tokenizer = None
92
+
93
+
94
def get_tokenizer():
    """Return the shared MINDI tokenizer, loading it from disk on first use."""
    global _tokenizer
    if _tokenizer is not None:
        return _tokenizer
    # Deferred import: transformers is heavy and only needed here.
    from transformers import AutoTokenizer
    _tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_PATH), trust_remote_code=True)
    log.info(f"Loaded tokenizer (vocab={len(_tokenizer):,})")
    return _tokenizer
101
+
102
+
103
def count_tokens(text: str) -> int:
    """Number of tokenizer tokens in *text*, excluding special tokens."""
    return len(get_tokenizer().encode(text, add_special_tokens=False))
106
+
107
+
108
# ── Language detection ────────────────────────────────────────────────
def detect_language(code: str, filename: str = "") -> str:
    """Best-effort language detection from a filename extension or content heuristics.

    The extension (when a filename is given) wins; otherwise a series of
    substring heuristics is tried from most to least specific. Returns
    "unknown" when nothing matches.
    """
    by_extension = {
        ".py": "python", ".js": "javascript", ".jsx": "javascript",
        ".ts": "typescript", ".tsx": "typescript", ".html": "html",
        ".css": "css", ".json": "json", ".md": "markdown",
        ".rs": "rust", ".go": "go", ".java": "java", ".cpp": "cpp",
        ".c": "c", ".rb": "ruby", ".php": "php", ".swift": "swift",
        ".kt": "kotlin", ".sql": "sql", ".sh": "bash",
    }
    if filename:
        lang = by_extension.get(Path(filename).suffix.lower())
        if lang is not None:
            return lang

    # Content heuristics, most specific first.
    looks_like_react = (
        "import React" in code or "from 'react'" in code or "jsx" in code.lower()
    )
    if looks_like_react:
        has_types = ": " in code and ("interface " in code or "type " in code)
        return "typescript" if has_types else "javascript"
    if "def " in code and "import " in code and ":" in code:
        return "python"
    if "func " in code and "package " in code:
        return "go"
    if "fn " in code and "let mut" in code:
        return "rust"
    if "public class" in code or "public static void" in code:
        return "java"
    if "<!DOCTYPE" in code or "<html" in code:
        return "html"
    if "function " in code or "const " in code or "=>" in code:
        return "javascript"
    return "unknown"
140
+
141
+
142
def detect_framework(code: str) -> str:
    """Best-effort framework detection from content markers; "none" if unmatched."""
    lowered = code.lower()
    # Ordered checks: the first hit wins, mirroring specificity.
    checks = (
        ("nextjs", "'use client'" in code or "next/" in code or "Next" in code),
        ("react", "import React" in code or "from 'react'" in code),
        ("express", "express" in lowered),
        ("flask", "from flask" in code or "Flask(" in code),
        ("django", "from django" in code),
        ("vue", "import vue" in lowered or "defineComponent" in code),
    )
    for framework, matched in checks:
        if matched:
            return framework
    return "none"
157
+
158
+
159
# ── Quality scoring ──────────────────────────────────────────────────
def score_quality(code: str, language: str) -> float:
    """Heuristic code-quality score on a 1-10 scale.

    Starts from a 5.0 baseline and applies fixed bonuses/penalties for
    length, comments, type annotations, imports, error handling, module
    structure, and a few low-quality markers. The result is rounded to
    one decimal and clamped to [1.0, 10.0].
    """
    stripped = code.strip()
    line_count = len(stripped.splitlines())
    lowered = code.lower()

    # (condition, delta) pairs, applied in order.
    adjustments = (
        (line_count >= 10, 0.5),                       # non-trivial length
        (line_count >= 30, 0.5),
        (line_count < 3, -2.0),                        # near-empty snippet
        ("//" in code or "/*" in code or '"""' in code
         or "'''" in code or "#" in code, 0.5),        # has comments/docstrings
        (language in ("typescript", "python")
         and ":" in code
         and ("interface " in code or "type " in code or "-> " in code), 0.5),
        ("import " in code or "from " in code or "require(" in code, 0.3),
        ("try" in code or "catch" in code or "except" in code, 0.3),
        ("export " in code or "module.exports" in code, 0.3),
        (len(stripped) < 50, -1.0),                    # very short payload
        (code.count("TODO") > 3 or code.count("FIXME") > 3, -0.5),
        ("console.log" in code and code.count("console.log") > 5, -0.3),
        ("function " in code or "class " in code
         or "def " in code or "const " in code, 0.3),  # named structure
        ("className" in code or "tailwind" in lowered, 0.3),
    )

    score = 5.0
    for hit, delta in adjustments:
        if hit:
            score += delta
    return max(1.0, min(10.0, round(score, 1)))
213
+
214
+
215
# ── Converter: wrap code in MINDI format ─────────────────────────────
def wrap_mindi_assistant(
    code: str,
    language: str = "typescript",
    filename: str = "",
    thinking: str = "",
    critique: str = "",
    suggestions: str = "",
) -> str:
    """Assemble an assistant turn in MINDI special-token format.

    Sections are emitted in fixed order -- thinking, file metadata, code,
    critique, suggestions -- with empty optional sections omitted; only
    the code section is mandatory. Sections are joined by blank lines.

    Args:
        code: Source code to embed (stripped of surrounding whitespace).
        language: Language label written into the file-metadata section.
        filename: Target path; when non-empty, a <|file_start|> section is
            emitted containing this path and the detected framework.
        thinking: Optional reasoning text for the <|think_start|> section.
        critique: Optional review text for the <|critique_start|> section.
        suggestions: Optional text for the <|suggest_start|> section.

    Returns:
        The assembled assistant message string.
    """
    parts = []

    # Thinking block
    if thinking:
        parts.append(f"<|think_start|>\n{thinking}\n<|think_end|>")

    # File metadata
    if filename:
        framework = detect_framework(code)
        # Bug fix: the path line previously emitted a hard-coded placeholder
        # instead of the caller-supplied filename, so every example carried
        # a useless path. Emit the real filename.
        parts.append(
            f"<|file_start|>\npath: {filename}\nlanguage: {language}\nframework: {framework}\n<|file_end|>"
        )

    # Code block (always present)
    parts.append(f"<|code_start|>\n{code.strip()}\n<|code_end|>")

    # Critique
    if critique:
        parts.append(f"<|critique_start|>\n{critique}\n<|critique_end|>")

    # Suggestions
    if suggestions:
        parts.append(f"<|suggest_start|>\n{suggestions}\n<|suggest_end|>")

    return "\n\n".join(parts)
248
+
249
+
250
def generate_thinking(user_request: str, language: str) -> str:
    """Produce a generic four-step thinking block (action verb chosen at random).

    Note: *user_request* is accepted for interface symmetry with the other
    generators but does not influence the output.
    """
    verb = random.choice(["analyze", "implement", "create", "design", "build"])
    steps = (
        f"The user wants me to {verb} something. Let me break this down:",
        "1. Understand the requirements from the request",
        f"2. Choose the right approach for {language}",
        "3. Write clean, production-ready code",
        "4. Review for best practices and accessibility",
    )
    return "\n".join(steps)
261
+
262
+
263
def generate_critique(language: str, code: str) -> str:
    """Produce a deterministic bullet-list code review for the snippet."""
    findings = [
        "βœ… Code structure: Well-organized with clear separation of concerns",
        "βœ… Naming: Descriptive variable and function names",
    ]
    if language in ("typescript", "javascript"):
        findings.append("βœ… Modern syntax: Uses ES6+ features appropriately")
    if "className" in code:
        findings.append("βœ… Styling: Tailwind CSS classes used correctly")
    findings += [
        "⚠️ Consider adding error handling for edge cases",
        "⚠️ Could benefit from unit tests",
    ]
    bullets = "\n".join(f"- {finding}" for finding in findings)
    return f"Code Review:\n{bullets}"
276
+
277
+
278
def generate_suggestions() -> str:
    """Pick four random improvement tips and return them as a numbered list."""
    pool = [
        "Add comprehensive error handling with try/catch",
        "Implement loading and error states for better UX",
        "Add TypeScript strict mode compliance",
        "Write unit tests with Jest and Testing Library",
        "Add JSDoc comments for public API",
        "Consider extracting reusable hooks",
        "Add proper aria attributes for accessibility",
        "Implement responsive design breakpoints",
        "Add performance optimization with useMemo/useCallback",
        "Consider adding Storybook stories for documentation",
    ]
    picks = random.sample(pool, min(4, len(pool)))
    numbered = "\n".join(f"{rank + 1}. {tip}" for rank, tip in enumerate(picks))
    return f"Suggested improvements:\n{numbered}"
294
+
295
+
296
+ # ── Source-specific converters ────────────────────────────────────────
297
+
298
+ def convert_codealpaca(raw: dict, idx: int) -> Optional[dict]:
299
+ """Convert CodeAlpaca example to MINDI format."""
300
+ instruction = raw.get("instruction", "").strip()
301
+ inp = raw.get("input", "").strip()
302
+ output = raw.get("output", "").strip()
303
+
304
+ if not instruction or not output:
305
+ return None
306
+
307
+ user_content = f"{instruction}\n{inp}".strip() if inp else instruction
308
+ language = detect_language(output)
309
+ quality = score_quality(output, language)
310
+
311
+ assistant_content = wrap_mindi_assistant(
312
+ code=output,
313
+ language=language,
314
+ thinking=generate_thinking(instruction, language),
315
+ critique=generate_critique(language, output),
316
+ suggestions=generate_suggestions(),
317
+ )
318
+
319
+ tokens = count_tokens(assistant_content)
320
+
321
+ return {
322
+ "id": f"mindi_{idx:06d}",
323
+ "type": "code_generation",
324
+ "source": "codealpaca",
325
+ "messages": [
326
+ {"role": "system", "content": MINDI_SYSTEM_PROMPT},
327
+ {"role": "user", "content": user_content},
328
+ {"role": "assistant", "content": assistant_content},
329
+ ],
330
+ "metadata": {
331
+ "language": language,
332
+ "framework": detect_framework(output),
333
+ "has_vision": False,
334
+ "tokens": tokens,
335
+ "quality_score": quality,
336
+ },
337
+ }
338
+
339
+
340
def convert_codefeedback(raw: dict, idx: int) -> Optional[dict]:
    """Convert a CodeFeedback query/answer record to MINDI format.

    Returns None when the query or answer field is empty.
    """
    query = raw.get("query", "").strip()
    answer = raw.get("answer", "").strip()
    if not query or not answer:
        return None

    # Prefer fenced code blocks from the answer; fall back to the full text.
    fenced = re.findall(r"```[\w]*\n(.*?)```", answer, re.DOTALL)
    code = "\n\n".join(fenced) if fenced else answer

    language = detect_language(code)
    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(query, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "codefeedback",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": query},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(code, language),
        },
    }
382
+
383
+
384
def convert_starcoderdata(raw: dict, idx: int) -> Optional[dict]:
    """Convert a raw StarCoder source file into a synthetic instruction example.

    A user request is fabricated from the first named definition found in
    the code (def / class / function / const / export), and the original
    file becomes the assistant's answer.

    Args:
        raw: StarCoder record; only the "content" field is used. (The old
            code also read "max_line_length"/"avg_line_length" into locals
            that were never used -- those dead reads have been removed.)
        idx: Global example index used to build the "mindi_NNNNNN" id.

    Returns:
        A MINDI-format example dict, or None for empty/near-empty files.
    """
    content = raw.get("content", "").strip()
    # Skip trivially small files -- they make useless training examples.
    if not content or len(content) < 50:
        return None

    language = detect_language(content)
    quality = score_quality(content, language)
    user_request = _starcoder_user_request(content, language)

    # Give common languages a plausible filename so wrap_mindi_assistant
    # emits its file-metadata section.
    filename = {
        "python": "main.py",
        "typescript": "index.tsx",
        "javascript": "index.js",
    }.get(language, "")

    assistant_content = wrap_mindi_assistant(
        code=content,
        language=language,
        filename=filename,
        thinking=generate_thinking(user_request, language),
        critique=generate_critique(language, content),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "starcoderdata",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(content),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }


def _starcoder_user_request(content: str, language: str) -> str:
    """Synthesize a one-line user request from the code's first definition."""
    head = content[:500]
    if "def " in head:
        match = re.search(r"def (\w+)", head)
        name = match.group(1) if match else "function"
        return f"Write a {language} function called `{name}` with proper implementation"
    if "class " in head:
        match = re.search(r"class (\w+)", head)
        name = match.group(1) if match else "Class"
        return f"Create a {language} class called `{name}` with full implementation"
    if "function " in head or "const " in head:
        match = re.search(r"(?:function|const)\s+(\w+)", head)
        name = match.group(1) if match else "component"
        return f"Implement `{name}` in {language} with clean, modern code"
    if "export " in head:
        match = re.search(r"export\s+(?:default\s+)?(?:function|class|const)\s+(\w+)", head)
        name = match.group(1) if match else "module"
        return f"Build an exported {language} module `{name}`"
    return f"Write this {language} code with best practices"
456
+
457
+
458
def convert_websight(raw: dict, idx: int) -> Optional[dict]:
    """Convert a WebSight HTML page (plus optional screenshot) to MINDI format.

    The HTML is kept verbatim as the answer body -- producing the JSX
    conversion is the training objective, not a preprocessing step.
    Returns None when the record has no text.
    """
    html = raw.get("text", "").strip()
    if not html:
        return None

    quality = score_quality(html, "html")
    has_image = "image" in raw or "screenshot" in raw

    user_request = "Convert this webpage design into a modern Next.js 14 component with Tailwind CSS"
    thinking = (
        "The user wants me to convert a web design to Next.js. I need to:\n"
        "1. Analyze the HTML structure and visual layout\n"
        "2. Convert HTML elements to React JSX syntax\n"
        "3. Replace CSS classes with Tailwind CSS utilities\n"
        "4. Add TypeScript types and proper component structure\n"
        "5. Ensure responsive design and accessibility"
    )

    assistant_content = wrap_mindi_assistant(
        code=html,
        language="typescript",
        filename="src/components/ConvertedPage.tsx",
        thinking=thinking,
        critique=generate_critique("typescript", html),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "vision_code",
        "source": "websight",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": "typescript",
            "framework": "nextjs",
            "has_vision": has_image,
            "tokens": count_tokens(assistant_content),
            "quality_score": quality,
        },
    }
508
+
509
+
510
def convert_synthetic(raw: dict, idx: int) -> Optional[dict]:
    """Finalize a synthetic record that is already close to MINDI shape.

    Returns None when the user or assistant turn is empty.
    """
    user_content = raw.get("user", "").strip()
    assistant_content = raw.get("assistant", "").strip()
    if not user_content or not assistant_content:
        return None

    source = raw.get("source", "synthetic")
    language = raw.get("language", "typescript")
    # Search-flavoured synthetic sources get their own example type.
    example_type = "search" if "search" in source else "code_generation"

    return {
        "id": f"mindi_{idx:06d}",
        "type": example_type,
        "source": source,
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": raw.get("framework", "nextjs"),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(assistant_content, language),
        },
    }
539
+
540
+
541
def convert_evol_code(raw: dict, idx: int) -> Optional[dict]:
    """Convert an EvolInstruct-Code instruction/output record to MINDI format.

    Returns None when the instruction or output field is empty.
    """
    instruction = raw.get("instruction", "").strip()
    output = raw.get("output", "").strip()
    if not instruction or not output:
        return None

    # Prefer fenced code blocks from the output; fall back to the full text.
    fenced = re.findall(r"```[\w]*\n(.*?)```", output, re.DOTALL)
    code = "\n\n".join(fenced) if fenced else output

    language = detect_language(code)
    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(instruction, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "evol_code",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(code, language),
        },
    }
582
+
583
+
584
def convert_magicoder(raw: dict, idx: int) -> Optional[dict]:
    """Convert a Magicoder record to MINDI format.

    Magicoder ships either instruction/response or problem/solution field
    pairs; the first non-empty alternative wins. Returns None when either
    side is missing.
    """
    instruction = (raw.get("instruction", "") or raw.get("problem", "")).strip()
    output = (raw.get("response", "") or raw.get("solution", "")).strip()
    if not instruction or not output:
        return None

    # Prefer fenced code blocks from the answer; fall back to the full text.
    fenced = re.findall(r"```[\w]*\n(.*?)```", output, re.DOTALL)
    code = "\n\n".join(fenced) if fenced else output

    language = detect_language(code)
    assistant_content = wrap_mindi_assistant(
        code=code,
        language=language,
        thinking=generate_thinking(instruction, language),
        critique=generate_critique(language, code),
        suggestions=generate_suggestions(),
    )

    return {
        "id": f"mindi_{idx:06d}",
        "type": "code_generation",
        "source": "magicoder",
        "messages": [
            {"role": "system", "content": MINDI_SYSTEM_PROMPT},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {
            "language": language,
            "framework": detect_framework(code),
            "has_vision": False,
            "tokens": count_tokens(assistant_content),
            "quality_score": score_quality(code, language),
        },
    }
626
+
627
+
628
# ── Source registry ───────────────────────────────────────────────────
# Maps each CLI source name to (raw JSONL filename under data/raw,
# converter function). All starcoder_* splits share one converter; the
# three synthetic sources share convert_synthetic.
SOURCE_CONVERTERS = {
    "codealpaca": ("codealpaca.jsonl", convert_codealpaca),
    "codefeedback": ("codefeedback.jsonl", convert_codefeedback),
    "starcoder_python": ("starcoder_python.jsonl", convert_starcoderdata),
    "starcoder_javascript": ("starcoder_javascript.jsonl", convert_starcoderdata),
    "starcoder_typescript": ("starcoder_typescript.jsonl", convert_starcoderdata),
    "starcoder_css": ("starcoder_css.jsonl", convert_starcoderdata),
    "starcoder_html": ("starcoder_html.jsonl", convert_starcoderdata),
    "evol_code": ("evol_code.jsonl", convert_evol_code),
    "magicoder": ("magicoder.jsonl", convert_magicoder),
    "websight": ("websight.jsonl", convert_websight),
    "synthetic_nextjs": ("synthetic_nextjs.jsonl", convert_synthetic),
    "search_examples": ("search_examples.jsonl", convert_synthetic),
    "sandbox_examples": ("sandbox_examples.jsonl", convert_synthetic),
}

# Single unified output file; converted examples from every source are
# appended here in processing order.
OUTPUT_FILE = DATA_PROCESSED / "mindi_all.jsonl"
646
+
647
+
648
# ── Main processing pipeline ─────────────────────────────────────────
def process_source(
    source_name: str,
    global_idx: int,
    progress: Progress,
    dry_run: bool = False,
) -> tuple[int, int, int]:
    """Convert one raw source file to MINDI format, appending to OUTPUT_FILE.

    Args:
        source_name: Key into SOURCE_CONVERTERS.
        global_idx: Next global example index (used for "mindi_NNNNNN" ids).
        progress: Shared rich Progress used for the per-source task bar.
        dry_run: When True, run converters but write nothing.

    Returns:
        (converted, skipped, updated_global_idx). Returns (0, 0, global_idx)
        unchanged for an unknown source or a missing input file.
    """
    if source_name not in SOURCE_CONVERTERS:
        log.error(f"Unknown source: {source_name}")
        return 0, 0, global_idx

    filename, converter = SOURCE_CONVERTERS[source_name]
    input_path = DATA_RAW / filename

    if not input_path.exists():
        log.warning(f"⏭️ Skipping {source_name}: {input_path} not found (download first)")
        return 0, 0, global_idx

    # Count lines for the progress bar. Bug fix: use a context manager so
    # the handle is closed deterministically (the old bare open() relied on
    # GC to close it).
    with open(input_path, encoding="utf-8") as f:
        total_lines = sum(1 for _ in f)
    task = progress.add_task(f"[cyan]{source_name}", total=total_lines)

    converted = 0
    skipped = 0
    output_handle = None

    if not dry_run:
        # Append mode so we can process sources incrementally.
        output_handle = open(OUTPUT_FILE, "a", encoding="utf-8")

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    progress.update(task, advance=1)
                    continue

                try:
                    raw = json.loads(line)
                except json.JSONDecodeError:
                    # Corrupt line -- count it and move on.
                    skipped += 1
                    progress.update(task, advance=1)
                    continue

                result = converter(raw, global_idx)

                if result is None:
                    skipped += 1
                else:
                    converted += 1
                    global_idx += 1
                    if output_handle is not None:
                        output_handle.write(json.dumps(result, ensure_ascii=False) + "\n")
                        # Bug fix: flush only after every 5000 *conversions*.
                        # The old check ran on every input line, so while
                        # converted was still 0 it flushed once per line.
                        if converted % 5000 == 0:
                            output_handle.flush()

                progress.update(task, advance=1)

    finally:
        if output_handle:
            output_handle.close()

    log.info(f"{'[DRY RUN] ' if dry_run else ''}βœ… {source_name}: {converted:,} converted, {skipped:,} skipped")
    return converted, skipped, global_idx
716
+
717
+
718
def run_processing(
    source: Optional[str] = None,
    dry_run: bool = False,
) -> None:
    """Run the MINDI conversion pipeline over one or all raw sources.

    Lists which raw files exist on disk, appends converted examples to
    OUTPUT_FILE (resuming IDs after any examples already present there),
    and prints a summary table plus 500K-target status at the end.

    Args:
        source: Process only this source (a SOURCE_CONVERTERS key);
            process every registered source when None.
        dry_run: Run all converters without writing any output.
    """
    console.print(Panel.fit(
        "[bold cyan]MINDI 1.5 Vision-Coder β€” MINDI Format Converter[/]\n"
        "[dim]Day 2 Step 2: Convert raw datasets to MINDI training format[/]",
        border_style="cyan",
    ))

    # Determine sources to process.
    sources = [source] if source else list(SOURCE_CONVERTERS.keys())

    # Show which raw input files are actually on disk.
    available_table = Table(title="πŸ“ Raw Data Files")
    available_table.add_column("Source", style="cyan")
    available_table.add_column("File")
    available_table.add_column("Exists")
    available_table.add_column("Size")

    for src in sources:
        fname, _ = SOURCE_CONVERTERS[src]
        fpath = DATA_RAW / fname
        exists = fpath.exists()
        size = f"{fpath.stat().st_size / (1024*1024):.1f} MB" if exists else "β€”"
        available_table.add_row(src, fname, "βœ…" if exists else "❌", size)

    console.print(available_table)

    # Resume support: new IDs continue after whatever is already in the
    # output file, since process_source appends.
    existing_count = 0
    if OUTPUT_FILE.exists():
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            existing_count = sum(1 for _ in f)
        log.info(f"πŸ“„ Existing mindi_all.jsonl has {existing_count:,} examples β€” appending new data")

    # Process each source.
    total_converted = 0
    total_skipped = 0
    global_idx = existing_count

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
        refresh_per_second=2,
    ) as progress:
        for src in sources:
            converted, skipped, global_idx = process_source(
                src, global_idx, progress, dry_run=dry_run
            )
            total_converted += converted
            total_skipped += skipped

    # Summary
    console.print()
    summary = Table(title="πŸ“Š Processing Summary")
    summary.add_column("Metric", style="cyan")
    summary.add_column("Value", justify="right", style="green")

    summary.add_row("Previously existing", f"{existing_count:,}")
    summary.add_row("Newly converted", f"{total_converted:,}")
    summary.add_row("Total skipped", f"{total_skipped:,}")
    grand_total = existing_count + total_converted
    summary.add_row("[bold]Grand total[/]", f"[bold]{grand_total:,}[/]")
    # Bug fix: when nothing has ever been converted, global_idx is 0 and the
    # old code printed the nonsense range "mindi_000000 β†’ mindi_-00001".
    if global_idx > 0:
        summary.add_row("Global ID range", f"mindi_000000 β†’ mindi_{global_idx - 1:06d}")
    else:
        summary.add_row("Global ID range", "β€”")

    if not dry_run and OUTPUT_FILE.exists():
        size_mb = OUTPUT_FILE.stat().st_size / (1024 * 1024)
        summary.add_row("Output file", str(OUTPUT_FILE.relative_to(PROJECT_ROOT)))
        summary.add_row("Output size", f"{size_mb:.1f} MB")

    console.print(summary)

    if grand_total >= 500_000:
        console.print("\n[bold green]πŸŽ‰ TARGET REACHED: 500K+ examples in MINDI format![/]")
    elif grand_total > 0:
        remaining = 500_000 - grand_total
        console.print(f"\n[yellow]⏳ {grand_total:,} total examples ({remaining:,} more needed for 500K target)[/]")
    else:
        console.print("\n[yellow]⚠️ No examples converted β€” download raw data first (scripts/download_datasets.py)[/]")
808
+
809
# ── CLI ───────────────────────────────────────────────────────────────
def main() -> None:
    """Parse CLI flags and launch the conversion pipeline."""
    cli = argparse.ArgumentParser(description="MINDI Format Converter")
    cli.add_argument("--source", type=str, help="Process a specific source only")
    cli.add_argument("--dry-run", action="store_true", help="Preview without writing output")
    opts = cli.parse_args()
    run_processing(source=opts.source, dry_run=opts.dry_run)


if __name__ == "__main__":
    main()