#!/usr/bin/env python3
"""
Upload ENTIRE MINDI 1.5 Vision-Coder project to HuggingFace.
REPO 1 (model): Mindigenous/MINDI-1.5-Vision-Coder
REPO 2 (dataset): Mindigenous/MINDI-1.5-training-data
Both repos are private. On the MI300X box we will clone them directly.
"""
import os
import sys
import time
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo
# ── Paths ──────────────────────────────────────────────────────────────
PROJECT_ROOT = Path(__file__).resolve().parent.parent
ENV_FILE = PROJECT_ROOT / ".env"
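# .env must provide HUGGINGFACE_TOKEN=hf_... (or HF_TOKEN=...); see load_token() below.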
# ── Repo names ─────────────────────────────────────────────────────────
MODEL_REPO = "Mindigenous/MINDI-1.5-Vision-Coder"
DATASET_REPO = "Mindigenous/MINDI-1.5-training-data"
# ── Model card (written to repo as README.md) ─────────────────────────
MODEL_CARD = """\
---
license: apache-2.0
language:
- en
tags:
- code-generation
- nextjs
- react
- typescript
- vision
- multimodal
- mindi
- mindigenous
base_model: Qwen/Qwen2.5-Coder-7B-Instruct
---
# MINDI 1.5 Vision-Coder
Built by MINDIGENOUS.AI
## Model Description
MINDI 1.5 is an agentic AI coding model that generates front-end code, sees screenshots of its own output, and critiques the result.
## Key Features
- Generates Next.js 14 + Tailwind + TypeScript code
- Sees screenshots via CLIP ViT-L/14
- Critiques its own UI/UX output
- Searches the internet for the latest packages
- Tests code in a sandbox environment
- Fixes its own errors automatically
## Training
- Base: Qwen/Qwen2.5-Coder-7B-Instruct
- Method: LoRA fine-tuning
- Hardware: AMD MI300X (192 GB VRAM)
- Dataset: 1,449,428 examples
- Tokens: 859,694,776
- Status: Training in progress
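## Example Usage
A minimal text-only sketch, assuming the final LoRA adapter is published at the root of this repo in standard PEFT format (not yet the case while training is in progress) and that you have access to this private repo; the CLIP vision pathway is omitted here:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE = "Qwen/Qwen2.5-Coder-7B-Instruct"
ADAPTER = "Mindigenous/MINDI-1.5-Vision-Coder"  # assumed adapter location

tokenizer = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(model, ADAPTER)  # attach the LoRA adapter

messages = [{"role": "user", "content": "Build a pricing page in Next.js 14 with Tailwind."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```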
## Built By
Faaz - MINDIGENOUS.AI
Mumbai, India
April 2026
"""
# ── Dataset card ───────────────────────────────────────────────────────
DATASET_CARD = """\
---
license: apache-2.0
language:
- en
tags:
- code-generation
- nextjs
- react
- typescript
- vision
- multimodal
- mindi
- mindigenous
size_categories:
- 1M<n<10M
---
# MINDI 1.5 Training Data
Training dataset for **MINDI 1.5 Vision-Coder** by MINDIGENOUS.AI
## Dataset Statistics
| Metric | Value |
|--------|-------|
| Total examples | 1,449,428 |
| Total tokens | 859,694,776 |
| Avg tokens/example | 593 |
| Avg quality score | 6.49 |
| Sources | 9 |
## Splits
| Split | Examples | Percentage |
|-------|----------|------------|
| Train | 1,304,486 | 90.0% |
| Validation | 72,471 | 5.0% |
| Test | 72,471 | 5.0% |
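The split files are JSON Lines under `processed/` in this repo (the layout pushed by the upload script). A minimal loading sketch, assuming that layout and an HF token with access to this private repo:

```python
from datasets import load_dataset

ds = load_dataset(
    "Mindigenous/MINDI-1.5-training-data",
    data_files={
        "train": "processed/train.jsonl",
        "validation": "processed/val.jsonl",
        "test": "processed/test.jsonl",
    },
    token=True,  # use the locally saved Hugging Face token
)
print(ds)
```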
## Sources
| Source | Examples | Kept % |
|--------|----------|--------|
| starcoderdata | 569,350 | 94.9% |
| websight | 250,987 | 99.99% |
| evol_code | 155,998 | 99.7% |
| codefeedback | 149,865 | 99.9% |
| magicoder | 149,987 | 99.99% |
| synthetic_nextjs | 90,000 | 100% (protected) |
| codealpaca | 59,241 | 98.8% |
| search_examples | 15,000 | 100% (protected) |
| sandbox_examples | 9,000 | 100% (protected) |
## Type Distribution
| Type | Examples |
|------|----------|
| code_generation | 1,183,441 |
| vision_code | 250,987 |
| search | 15,000 |
## Language Distribution
| Language | Examples |
|----------|----------|
| unknown | 490,305 |
| typescript | 375,859 |
| javascript | 298,497 |
| python | 211,842 |
| html | 36,371 |
| java | 32,458 |
| rust | 3,709 |
| go | 387 |
## Format
Each example is one JSON object per line in the `.jsonl` files, with the following fields (an illustrative record follows the list):
- `conversations`: list of `{"role": ..., "content": ...}` turns
- `source`: dataset origin
- `type`: code_generation / vision_code / search
- `language`: programming language
- `quality_score`: heuristic quality (0-10+)
- `token_count`: number of tokens
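An illustrative record (all values invented for demonstration), shown pretty-printed; on disk each record is a single line:

```json
{
  "conversations": [
    {"role": "user", "content": "Create a responsive pricing card in Next.js 14 with Tailwind."},
    {"role": "assistant", "content": "export default function PricingCard() { /* ... */ }"}
  ],
  "source": "synthetic_nextjs",
  "type": "code_generation",
  "language": "typescript",
  "quality_score": 7.2,
  "token_count": 412
}
```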
## Quality Filtering
- Protected sources (sandbox, search, synthetic_nextjs) bypass the aggressive quality filters
- Examples containing MINDI special tokens receive a quality-score bonus, so agentic examples are retained
- Deduplication via SHA-256 content hashing (sketched below)
- Rejection reasons: too_many_tokens (30,637), boilerplate (1,373), duplicate (59)
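A simplified sketch of the dedup step (the real pipeline may normalize and hash the content differently):

```python
import hashlib
import json

seen_hashes = set()

def is_duplicate(example: dict) -> bool:
    # Hash the conversation turns deterministically and reject repeats.
    digest = hashlib.sha256(
        json.dumps(example["conversations"], sort_keys=True).encode("utf-8")
    ).hexdigest()
    if digest in seen_hashes:
        return True
    seen_hashes.add(digest)
    return False
```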
## Built By
Faaz - MINDIGENOUS.AI
Mumbai, India, April 2026
"""
# ────────────────────────────────────────────────────────────────────────
def load_token() -> str:
    """Load HF token from .env."""
    load_dotenv(ENV_FILE)
    token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
    if not token:
        print("ERROR: No HUGGINGFACE_TOKEN or HF_TOKEN found in .env")
        sys.exit(1)
    return token

def ensure_repo(api: HfApi, repo_id: str, repo_type: str, token: str):
    """Create repo if it doesn't exist."""
    try:
        create_repo(
            repo_id=repo_id,
            repo_type=repo_type,
            private=True,
            token=token,
            exist_ok=True,
        )
        print(f" Repo ready: {repo_id} ({repo_type})")
    except Exception as e:
        print(f" Repo create/check: {e}")

def upload_folder(api: HfApi, local: Path, remote: str, repo_id: str,
                  repo_type: str, token: str):
    """Upload a local folder to HF repo."""
    if not local.exists():
        print(f" SKIP (not found): {local}")
        return
    label = str(local.relative_to(PROJECT_ROOT))
    print(f" Uploading {label}/ to {repo_type} repo ... ", end="", flush=True)
    t0 = time.time()
    api.upload_folder(
        repo_id=repo_id,
        repo_type=repo_type,
        folder_path=str(local),
        path_in_repo=remote,
        token=token,
        ignore_patterns=["__pycache__", "*.pyc", ".git"],
    )
    print(f"done ({time.time() - t0:.1f}s)")

def upload_file(api: HfApi, local: Path, remote: str, repo_id: str,
                repo_type: str, token: str):
    """Upload a single file to HF repo."""
    if not local.exists():
        print(f" SKIP (not found): {local.name}")
        return
    size_mb = local.stat().st_size / (1024 * 1024)
    label = str(local.relative_to(PROJECT_ROOT))
    print(f" Uploading {label} ({size_mb:.1f} MB) to {repo_type} repo ... ",
          end="", flush=True)
    t0 = time.time()
    api.upload_file(
        repo_id=repo_id,
        repo_type=repo_type,
        path_or_fileobj=str(local),
        path_in_repo=remote,
        token=token,
    )
    print(f"done ({time.time() - t0:.1f}s)")

def upload_readme(api: HfApi, content: str, repo_id: str,
                  repo_type: str, token: str):
    """Upload a README.md string to a repo."""
    print(f" Uploading README.md to {repo_type} repo ... ", end="", flush=True)
    api.upload_file(
        repo_id=repo_id,
        repo_type=repo_type,
        path_or_fileobj=content.encode("utf-8"),
        path_in_repo="README.md",
        token=token,
    )
    print("done")

# ────────────────────────────────────────────────────────────────────────
def main():
    print("=" * 60)
    print(" MINDI 1.5 - Upload Everything to HuggingFace")
    print("=" * 60)
    print()
    token = load_token()
    api = HfApi()

    # ── Create repos ───────────────────────────────────────────────
    print("[1/4] Creating repositories ...")
    ensure_repo(api, MODEL_REPO, "model", token)
    ensure_repo(api, DATASET_REPO, "dataset", token)
    print()

    # ── REPO 1: Model (code + configs) ─────────────────────────────
    print("[2/4] Uploading to MODEL repo:", MODEL_REPO)
    print("-" * 50)

    # Folders
    model_folders = [
        (PROJECT_ROOT / "src", "src"),
        (PROJECT_ROOT / "scripts", "scripts"),
        (PROJECT_ROOT / "configs", "configs"),
        (PROJECT_ROOT / "data" / "tokenizer", "data/tokenizer"),
        (PROJECT_ROOT / "tests", "tests"),
        (PROJECT_ROOT / "api", "api"),
    ]
    for local, remote in model_folders:
        upload_folder(api, local, remote, MODEL_REPO, "model", token)

    # Single files
    model_files = [
        (PROJECT_ROOT / "requirements.txt", "requirements.txt"),
        (PROJECT_ROOT / "setup.py", "setup.py"),
        (PROJECT_ROOT / "activate_mindi.bat", "activate_mindi.bat"),
        (PROJECT_ROOT / ".env.example", ".env.example"),
    ]
    for local, remote in model_files:
        upload_file(api, local, remote, MODEL_REPO, "model", token)

    # setup_mi300x.sh
    mi300x_sh = PROJECT_ROOT / "setup_mi300x.sh"
    if mi300x_sh.exists():
        upload_file(api, mi300x_sh, "setup_mi300x.sh", MODEL_REPO, "model", token)

    # Model card replaces README.md
    upload_readme(api, MODEL_CARD, MODEL_REPO, "model", token)
    print()

    # ── REPO 2: Dataset ────────────────────────────────────────────
    print("[3/4] Uploading to DATASET repo:", DATASET_REPO)
    print("-" * 50)

    processed = PROJECT_ROOT / "data" / "processed"
    dataset_files = [
        (processed / "train.jsonl", "processed/train.jsonl"),
        (processed / "val.jsonl", "processed/val.jsonl"),
        (processed / "test.jsonl", "processed/test.jsonl"),
        (processed / "mindi_filtered.jsonl", "processed/mindi_filtered.jsonl"),
        (processed / "filter_report.json", "processed/filter_report.json"),
        (processed / "split_meta.json", "processed/split_meta.json"),
    ]
    for local, remote in dataset_files:
        upload_file(api, local, remote, DATASET_REPO, "dataset", token)

    # Raw data folder
    upload_folder(
        api, PROJECT_ROOT / "data" / "raw", "raw",
        DATASET_REPO, "dataset", token,
    )

    # Tokenizer copy in dataset repo
    upload_folder(
        api, PROJECT_ROOT / "data" / "tokenizer", "tokenizer",
        DATASET_REPO, "dataset", token,
    )

    # Dataset card
    upload_readme(api, DATASET_CARD, DATASET_REPO, "dataset", token)
    print()

    # ── Done ───────────────────────────────────────────────────────
    print("[4/4] Upload complete!")
    print()

    box_lines = [
        "UPLOAD COMPLETE!",
        "",
        "Model repo:",
        "  huggingface.co/Mindigenous/",
        "    MINDI-1.5-Vision-Coder",
        "",
        "Dataset repo:",
        "  huggingface.co/datasets/",
        "    Mindigenous/MINDI-1.5-training-data",
        "",
        "On MI300X just run:",
        "  git clone https://huggingface.co/",
        "    Mindigenous/MINDI-1.5-Vision-Coder",
        "",
        "Ready to train! πŸš€",
    ]
    width = max(len(line) for line in box_lines) + 2
    print("╔" + "═" * width + "β•—")
    for line in box_lines:
        print("β•‘ " + line.ljust(width - 2) + " β•‘")
    print("β•š" + "═" * width + "╝")


if __name__ == "__main__":
    main()
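
# ── MI300X side (illustrative, not run by this script) ─────────────────
# The same repos can also be pulled with huggingface_hub instead of git clone,
# e.g. (requires the same HF token, since both repos are private):
#
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id="Mindigenous/MINDI-1.5-Vision-Coder",
#                     repo_type="model", local_dir="MINDI-1.5-Vision-Coder")
#   snapshot_download(repo_id="Mindigenous/MINDI-1.5-training-data",
#                     repo_type="dataset", local_dir="MINDI-1.5-training-data")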