| |
| """ |
| Upload ENTIRE MINDI 1.5 Vision-Coder project to HuggingFace. |
| |
| REPO 1 (model): Mindigenous/MINDI-1.5-Vision-Coder |
| REPO 2 (dataset): Mindigenous/MINDI-1.5-training-data |
| |
| Both private. On MI300X we will clone these repos directly. |
| """ |
|
|
| import os |
| import sys |
| import time |
| from pathlib import Path |
|
|
| from dotenv import load_dotenv |
| from huggingface_hub import HfApi, create_repo |
|
|
| |
# Filesystem layout: this script sits one directory below the project root.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent
ENV_FILE = PROJECT_ROOT.joinpath(".env")  # dotenv file read by load_token()

# Target repositories on the Hugging Face Hub (source/config and training data).
MODEL_REPO = "Mindigenous/MINDI-1.5-Vision-Coder"
DATASET_REPO = "Mindigenous/MINDI-1.5-training-data"
|
|
| |
| MODEL_CARD = """\ |
| --- |
| license: apache-2.0 |
| language: |
| - en |
| tags: |
| - code-generation |
| - nextjs |
| - react |
| - typescript |
| - vision |
| - multimodal |
| - mindi |
| - mindigenous |
| base_model: Qwen/Qwen2.5-Coder-7B-Instruct |
| --- |
| |
| # MINDI 1.5 Vision-Coder |
| |
| Built by MINDIGENOUS.AI |
| |
| ## Model Description |
| MINDI 1.5 is an agentic AI coding model |
| that sees its own output and critiques it. |
| |
| ## Key Features |
| - Generates Next.js 14 + Tailwind + TypeScript |
| - Sees screenshots via CLIP ViT-L/14 |
| - Critiques its own UI/UX output |
| - Searches internet for latest packages |
| - Tests code in sandbox environment |
| - Self-fixes errors automatically |
| |
| ## Training |
| - Base: Qwen/Qwen2.5-Coder-7B-Instruct |
| - Method: LoRA fine-tuning |
| - Hardware: AMD MI300X 192GB VRAM |
| - Dataset: 1,449,428 examples |
| - Tokens: 859,694,776 |
| - Status: Training in progress |
| |
| ## Built By |
| Faaz - MINDIGENOUS.AI |
| Mumbai, India |
| April 2026 |
| """ |
|
|
| |
| DATASET_CARD = """\ |
| --- |
| license: apache-2.0 |
| language: |
| - en |
| tags: |
| - code-generation |
| - nextjs |
| - react |
| - typescript |
| - vision |
| - multimodal |
| - mindi |
| - mindigenous |
| size_categories: |
| - 1M<n<10M |
| --- |
| |
| # MINDI 1.5 Training Data |
| |
| Training dataset for **MINDI 1.5 Vision-Coder** by MINDIGENOUS.AI |
| |
| ## Dataset Statistics |
| | Metric | Value | |
| |--------|-------| |
| | Total examples | 1,449,428 | |
| | Total tokens | 859,694,776 | |
| | Avg tokens/example | 593 | |
| | Avg quality score | 6.49 | |
| | Sources | 9 | |
| |
| ## Splits |
| | Split | Examples | Percentage | |
| |-------|----------|------------| |
| | Train | 1,304,486 | 90.0% | |
| | Validation | 72,471 | 5.0% | |
| | Test | 72,471 | 5.0% | |
| |
| ## Sources |
| | Source | Examples | Kept % | |
| |--------|----------|--------| |
| | starcoderdata | 569,350 | 94.9% | |
| | websight | 250,987 | 99.99% | |
| | evol_code | 155,998 | 99.7% | |
| | codefeedback | 149,865 | 99.9% | |
| | magicoder | 149,987 | 99.99% | |
| | synthetic_nextjs | 90,000 | 100% (protected) | |
| | codealpaca | 59,241 | 98.8% | |
| | search_examples | 15,000 | 100% (protected) | |
| | sandbox_examples | 9,000 | 100% (protected) | |
| |
| ## Type Distribution |
| | Type | Examples | |
| |------|----------| |
| | code_generation | 1,183,441 | |
| | vision_code | 250,987 | |
| | search | 15,000 | |
| |
| ## Language Distribution |
| | Language | Examples | |
| |----------|----------| |
| | unknown | 490,305 | |
| | typescript | 375,859 | |
| | javascript | 298,497 | |
| | python | 211,842 | |
| | html | 36,371 | |
| | java | 32,458 | |
| | rust | 3,709 | |
| | go | 387 | |
| |
| ## Format |
| Each example is a JSON object with: |
| - `conversations`: list of `{"role": ..., "content": ...}` turns |
| - `source`: dataset origin |
| - `type`: code_generation / vision_code / search |
| - `language`: programming language |
| - `quality_score`: heuristic quality (0-10+) |
| - `token_count`: number of tokens |
| |
| ## Quality Filtering |
| - Protected sources (sandbox, search, synthetic_nextjs) bypass aggressive filters |
| - MINDI special token bonuses boost agentic examples |
| - Dedup via SHA-256 content hashing |
| - Rejection reasons: too_many_tokens (30,637), boilerplate (1,373), duplicate (59) |
| |
| ## Built By |
| Faaz - MINDIGENOUS.AI |
| Mumbai, India — April 2026 |
| """ |
|
|
|
|
| |
def load_token() -> str:
    """Return the Hugging Face access token, exiting the process if none is set.

    The dotenv file at ``ENV_FILE`` is loaded first, then the environment is
    consulted for ``HUGGINGFACE_TOKEN`` and, failing that, ``HF_TOKEN``.
    Because the lookup goes through ``os.getenv``, a value already present in
    the process environment is accepted as well.

    Returns:
        The first non-empty token found.

    Exits:
        Prints an error and calls ``sys.exit(1)`` when neither variable holds
        a non-empty value.
    """
    load_dotenv(ENV_FILE)
    # Checked in priority order; an empty string counts as "not set".
    for variable in ("HUGGINGFACE_TOKEN", "HF_TOKEN"):
        candidate = os.getenv(variable)
        if candidate:
            return candidate
    print("ERROR: No HUGGINGFACE_TOKEN or HF_TOKEN found in .env")
    sys.exit(1)
|
|
|
|
def ensure_repo(api: HfApi, repo_id: str, repo_type: str, token: str) -> None:
    """Make sure a private Hub repository exists, reporting failures instead of raising.

    Args:
        api: Hub client used to issue the request. The call goes through this
            client (rather than the module-level ``create_repo`` helper) so
            that it matches the upload helpers, which all use ``api``.
        repo_id: Repository identifier in ``namespace/name`` form.
        repo_type: Repository kind passed to the Hub (this script uses
            ``"model"`` and ``"dataset"``).
        token: Access token used for the request.

    The request is sent with ``private=True`` and ``exist_ok=True``. Any
    exception is caught and printed so the caller can carry on with the
    uploads; this step is deliberately best-effort.
    """
    try:
        api.create_repo(
            repo_id=repo_id,
            repo_type=repo_type,
            private=True,
            token=token,
            exist_ok=True,
        )
    except Exception as e:  # best-effort boundary: report and let uploads decide
        print(f" Repo create/check: {e}")
    else:
        print(f" Repo ready: {repo_id} ({repo_type})")
|
|
|
|
def upload_folder(api: HfApi, local: Path, remote: str, repo_id: str,
                  repo_type: str, token: str):
    """Push one local directory into a Hub repository, with progress output.

    Args:
        api: Hub client whose ``upload_folder`` method performs the transfer.
        local: Directory on disk; expected to live under ``PROJECT_ROOT``
            because the progress label is computed relative to it
            (``Path.relative_to`` raises ``ValueError`` otherwise).
        remote: Destination path inside the repository.
        repo_id: Target repository identifier.
        repo_type: Target repository kind (also shown in the progress line).
        token: Access token used for the request.

    A missing ``local`` path is reported with a SKIP line and nothing is
    uploaded. Otherwise the elapsed wall-clock time is printed on completion.
    """
    if local.exists():
        shown = local.relative_to(PROJECT_ROOT)
        print(f" Uploading {shown}/ to {repo_type} repo ... ", end="", flush=True)
        started = time.time()
        api.upload_folder(
            folder_path=str(local),
            path_in_repo=remote,
            repo_id=repo_id,
            repo_type=repo_type,
            token=token,
            ignore_patterns=["__pycache__", "*.pyc", ".git"],
        )
        elapsed = time.time() - started
        print(f"done ({elapsed:.1f}s)")
    else:
        print(f" SKIP (not found): {local}")
|
|
|
|
def upload_file(api: HfApi, local: Path, remote: str, repo_id: str,
                repo_type: str, token: str):
    """Push one local file into a Hub repository, with progress output.

    Args:
        api: Hub client whose ``upload_file`` method performs the transfer.
        local: File on disk; expected to live under ``PROJECT_ROOT`` because
            the progress label is computed relative to it
            (``Path.relative_to`` raises ``ValueError`` otherwise).
        remote: Destination path inside the repository.
        repo_id: Target repository identifier.
        repo_type: Target repository kind (also shown in the progress line).
        token: Access token used for the request.

    A missing file is reported with a SKIP line (file name only) and nothing
    is uploaded. Otherwise the file size in MB is shown up front and the
    elapsed wall-clock time is printed on completion.
    """
    if local.exists():
        bytes_per_mb = 1024 * 1024
        megabytes = local.stat().st_size / bytes_per_mb
        shown = local.relative_to(PROJECT_ROOT)
        print(
            f" Uploading {shown} ({megabytes:.1f} MB) to {repo_type} repo ... ",
            end="",
            flush=True,
        )
        started = time.time()
        api.upload_file(
            path_or_fileobj=str(local),
            path_in_repo=remote,
            repo_id=repo_id,
            repo_type=repo_type,
            token=token,
        )
        elapsed = time.time() - started
        print(f"done ({elapsed:.1f}s)")
    else:
        print(f" SKIP (not found): {local.name}")
|
|
|
|
def upload_readme(api: HfApi, content: str, repo_id: str,
                  repo_type: str, token: str):
    """Publish an in-memory string as ``README.md`` at the root of a repository.

    Args:
        api: Hub client whose ``upload_file`` method performs the transfer.
        content: README text; it is UTF-8 encoded and sent as bytes.
        repo_id: Target repository identifier.
        repo_type: Target repository kind (also shown in the progress line).
        token: Access token used for the request.
    """
    print(f" Uploading README.md to {repo_type} repo ... ", end="", flush=True)
    readme_bytes = content.encode("utf-8")
    api.upload_file(
        path_or_fileobj=readme_bytes,
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type=repo_type,
        token=token,
    )
    print("done")
|
|
|
|
| |
def _print_summary_box(rows, inner_width=38):
    """Print ``rows`` inside a box-drawing frame, each indented by two spaces."""
    import unicodedata  # local import: only needed to measure glyph widths

    def columns(text):
        # Wide/fullwidth glyphs (such as emoji) take two terminal columns, so
        # len() alone would push the right-hand border out of line.
        return sum(
            2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
            for ch in text
        )

    print("╔" + "═" * inner_width + "╗")
    for row in rows:
        cell = f"  {row}"
        padding = " " * max(inner_width - columns(cell), 0)
        print(f"║{cell}{padding}║")
    print("╚" + "═" * inner_width + "╝")


def main():
    """Upload the project to its model and dataset repositories.

    Steps, in order:
      1. Load the access token and make sure both repositories exist.
      2. Model repo: source/config folders, top-level project files, the
         optional ``setup_mi300x.sh``, and the model card as README.md.
      3. Dataset repo: processed splits and reports, the raw data folder,
         the tokenizer folder, and the dataset card as README.md.
      4. Print a summary box with the repository locations.

    Missing folders/files are skipped by the upload helpers; an upload error
    propagates and aborts the run.
    """
    print("=" * 60)
    print(" MINDI 1.5 — Upload Everything to HuggingFace")
    print("=" * 60)
    print()

    token = load_token()
    api = HfApi()

    # Step 1: repositories
    print("[1/4] Creating repositories ...")
    ensure_repo(api, MODEL_REPO, "model", token)
    ensure_repo(api, DATASET_REPO, "dataset", token)
    print()

    # Step 2: model repository
    print("[2/4] Uploading to MODEL repo:", MODEL_REPO)
    print("-" * 50)

    # Folders, as (local path, path in repo).
    model_folders = [
        (PROJECT_ROOT / "src", "src"),
        (PROJECT_ROOT / "scripts", "scripts"),
        (PROJECT_ROOT / "configs", "configs"),
        (PROJECT_ROOT / "data" / "tokenizer", "data/tokenizer"),
        (PROJECT_ROOT / "tests", "tests"),
        (PROJECT_ROOT / "api", "api"),
    ]
    for local, remote in model_folders:
        upload_folder(api, local, remote, MODEL_REPO, "model", token)

    # Individual top-level files.
    model_files = [
        (PROJECT_ROOT / "requirements.txt", "requirements.txt"),
        (PROJECT_ROOT / "setup.py", "setup.py"),
        (PROJECT_ROOT / "activate_mindi.bat", "activate_mindi.bat"),
        (PROJECT_ROOT / ".env.example", ".env.example"),
    ]
    for local, remote in model_files:
        upload_file(api, local, remote, MODEL_REPO, "model", token)

    # Optional setup script: checked here so its absence produces no SKIP line.
    mi300x_sh = PROJECT_ROOT / "setup_mi300x.sh"
    if mi300x_sh.exists():
        upload_file(api, mi300x_sh, "setup_mi300x.sh", MODEL_REPO, "model", token)

    upload_readme(api, MODEL_CARD, MODEL_REPO, "model", token)
    print()

    # Step 3: dataset repository
    print("[3/4] Uploading to DATASET repo:", DATASET_REPO)
    print("-" * 50)

    processed = PROJECT_ROOT / "data" / "processed"
    dataset_files = [
        (processed / "train.jsonl", "processed/train.jsonl"),
        (processed / "val.jsonl", "processed/val.jsonl"),
        (processed / "test.jsonl", "processed/test.jsonl"),
        (processed / "mindi_filtered.jsonl", "processed/mindi_filtered.jsonl"),
        (processed / "filter_report.json", "processed/filter_report.json"),
        (processed / "split_meta.json", "processed/split_meta.json"),
    ]
    for local, remote in dataset_files:
        upload_file(api, local, remote, DATASET_REPO, "dataset", token)

    upload_folder(
        api, PROJECT_ROOT / "data" / "raw", "raw",
        DATASET_REPO, "dataset", token,
    )

    # The tokenizer is also placed in the dataset repo (it was uploaded to the
    # model repo above under data/tokenizer).
    upload_folder(
        api, PROJECT_ROOT / "data" / "tokenizer", "tokenizer",
        DATASET_REPO, "dataset", token,
    )

    upload_readme(api, DATASET_CARD, DATASET_REPO, "dataset", token)
    print()

    # Step 4: summary
    print("[4/4] Upload complete!")
    print()
    _print_summary_box([
        "UPLOAD COMPLETE!",
        "",
        "Model repo:",
        "huggingface.co/Mindigenous/",
        "MINDI-1.5-Vision-Coder",
        "",
        "Dataset repo:",
        "huggingface.co/datasets/",
        "Mindigenous/MINDI-1.5-training-data",
        "",
        "On MI300X just run:",
        "git clone https://huggingface.co/",
        "Mindigenous/MINDI-1.5-Vision-Coder",
        "",
        "Ready to train! 🚀",
    ])


if __name__ == "__main__":
    main()
|
|