oppo-node / scripts /download_frontier_models_2026.py
DJ-Goanna-Coding's picture
Deploy from GitHub Actions
c87f72b verified
#!/usr/bin/env python3
"""
🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (2026)
Downloads cutting-edge AI models discovered via web reconnaissance
Author: Citadel Architect v25.0.OMNI++
Date: April 2026
"""
import os
import sys
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional
try:
from huggingface_hub import snapshot_download, hf_hub_download, list_repo_files
except ImportError:
print("❌ Error: huggingface_hub not installed")
print(" Install with: pip install huggingface-hub")
sys.exit(1)
# Startup banner: emitted once, as soon as the module is loaded.
print(
    "=" * 80,
    "🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (April 2026)",
    "=" * 80,
    "",
    sep="\n",
)

# Setup paths
# BASE_DIR is the directory two levels above this file; every model is stored
# beneath BASE_DIR/data/models, which is created up front if it is missing.
BASE_DIR = Path(__file__).parent.parent
MODELS_DIR = BASE_DIR.joinpath("data", "models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)
# Frontier Models Registry (April 2026 Discovery)
#
# Layout: category -> subcategory -> list of model specs.
# main() walks this mapping and, for every spec, reads:
#   name, repo_id, local_dir, description, priority, license, capabilities
# plus the optional "note" key, which is printed before the download starts.
# download_model() stores each model under MODELS_DIR / <category> / <local_dir>.
FRONTIER_MODELS = {
    "Core": {
        "gemma-4": [
            {
                "name": "Gemma 4 - 2B (E2B)",
                # Placeholder repo (see "note"): the name says Gemma 4 but the
                # repo_id is the older gemma-2b-it checkpoint.
                "repo_id": "google/gemma-2b-it",
                "local_dir": "gemma-4-2b",
                "description": "Gemma 4 lightweight (2B params) - multimodal, edge-ready",
                "priority": "CRITICAL",
                "license": "Apache 2.0",
                "capabilities": ["text", "image", "audio", "256K context"],
                "note": "Using gemma-2b as placeholder until gemma-4 official release"
            },
            {
                "name": "Gemma 4 - 4B (E4B)",
                # Placeholder repo (see "note"): gemma-7b-it stands in for the 4B entry.
                "repo_id": "google/gemma-7b-it",
                "local_dir": "gemma-4-4b",
                "description": "Gemma 4 balanced (4B params) - multimodal with edge optimization",
                "priority": "CRITICAL",
                "license": "Apache 2.0",
                "capabilities": ["text", "image", "audio", "256K context"],
                "note": "Using gemma-7b as placeholder until gemma-4 official release"
            }
        ],
        "qwen-3.5": [
            {
                "name": "Qwen 3.5 - 7B Instruct",
                # NOTE(review): name/local_dir say 3.5 but repo_id is Qwen2.5 and there
                # is no "note" marking it as a placeholder - confirm intent.
                "repo_id": "Qwen/Qwen2.5-7B-Instruct",
                "local_dir": "qwen-3.5-7b-instruct",
                "description": "Qwen 3.5 multilingual code specialist",
                "priority": "HIGH",
                "license": "Apache 2.0",
                "capabilities": ["multilingual", "code", "128K context"]
            },
            {
                "name": "Qwen 3.5 - 14B Instruct",
                # NOTE(review): same 3.5-vs-2.5 mismatch as the 7B entry - confirm intent.
                "repo_id": "Qwen/Qwen2.5-14B-Instruct",
                "local_dir": "qwen-3.5-14b-instruct",
                "description": "Qwen 3.5 larger variant for complex tasks",
                "priority": "MEDIUM",
                "license": "Apache 2.0",
                "capabilities": ["multilingual", "code", "128K context"]
            }
        ]
    },
    "Utility": {
        "deepseek-v4": [
            {
                "name": "DeepSeek Coder V2",
                # NOTE(review): subcategory says "v4", name says "V2", and the repo_id
                # carries no version suffix - confirm which model is actually wanted.
                "repo_id": "deepseek-ai/deepseek-coder-6.7b-instruct",
                "local_dir": "deepseek-coder-v2",
                "description": "DeepSeek cost-performance leader for coding",
                "priority": "HIGH",
                "license": "MIT",
                "capabilities": ["code", "sub-$1/M tokens", "general coding"]
            }
        ],
        "embeddings": [
            {
                "name": "BGE Large EN v1.5",
                "repo_id": "BAAI/bge-large-en-v1.5",
                "local_dir": "bge-large-en-v1.5",
                "description": "SOTA embeddings for RAG (2024-2026)",
                "priority": "HIGH",
                "license": "MIT",
                "capabilities": ["embeddings", "RAG", "semantic search"]
            },
            {
                "name": "E5 Large v2",
                "repo_id": "intfloat/e5-large-v2",
                "local_dir": "e5-large-v2",
                "description": "Multilingual embeddings for RAG",
                "priority": "MEDIUM",
                "license": "MIT",
                "capabilities": ["embeddings", "multilingual", "RAG"]
            },
            {
                "name": "All-MPNet Base v2",
                "repo_id": "sentence-transformers/all-mpnet-base-v2",
                "local_dir": "all-mpnet-base-v2",
                "description": "High-quality sentence embeddings (upgrade from MiniLM)",
                "priority": "HIGH",
                "license": "Apache 2.0",
                "capabilities": ["embeddings", "sentence similarity", "RAG"]
            }
        ]
    },
    "Research": {
        "nemotron-3": [
            {
                "name": "NVIDIA Nemotron Mini",
                "repo_id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
                "local_dir": "nemotron-mini-8b",
                "description": "NVIDIA research model - efficient and capable",
                "priority": "MEDIUM",
                "license": "NVIDIA Open Model License",
                "capabilities": ["research", "efficient", "8B params"]
            }
        ]
    },
    "Lore": {
        "text-to-video": [
            {
                "name": "CogVideoX",
                "repo_id": "THUDM/CogVideoX-5b",
                "local_dir": "cogvideox-5b",
                "description": "Text-to-video generation model",
                "priority": "LOW",
                "license": "Apache 2.0",
                "capabilities": ["text-to-video", "video generation"],
                # The note is informational only: main() still attempts this download
                # like every other entry.
                "note": "Large model - download on-demand only"
            }
        ]
    }
}
# Proprietary API-only models (for registry only, not download)
#
# Keyed by model identifier. Nothing in this mapping is ever passed to
# download_model(); main() embeds it in model_registry.json (under
# "api_only_models") and also writes it to api_models_registry.json.
# Entries are free-form metadata; "variants" is present only on some of them.
API_ONLY_MODELS = {
    "claude-opus-4.6": {
        "provider": "Anthropic",
        "capabilities": ["1M context", "coding", "agent teams", "80.8% SWE-Bench"],
        "pricing": "Premium tier",
        "api_endpoint": "https://api.anthropic.com/v1/messages",
        "documentation": "https://docs.anthropic.com/claude/reference/getting-started-with-the-api"
    },
    "gpt-5.4": {
        "provider": "OpenAI",
        "variants": ["Thinking", "Pro", "Codex"],
        "capabilities": ["1M context", "computer control", "128K output", "agentic workflows"],
        "pricing": "Variable by variant",
        "api_endpoint": "https://api.openai.com/v1/chat/completions",
        "documentation": "https://platform.openai.com/docs/api-reference"
    },
    "gemini-3.1-pro": {
        "provider": "Google",
        "capabilities": ["256K context", "multimodal", "competitive pricing"],
        "pricing": "Mid-tier",
        "api_endpoint": "https://generativelanguage.googleapis.com/v1beta/models",
        "documentation": "https://ai.google.dev/docs"
    }
}
def _repo_size_gb(repo_id: str) -> Optional[float]:
    """Return the summed file size of *repo_id* in GiB, or None if it cannot be determined."""
    # Imported locally so the module-level import block stays untouched.
    from huggingface_hub import HfApi

    try:
        info = HfApi().model_info(repo_id, files_metadata=True)
        # A sibling's size may be missing; count it as 0 rather than failing.
        total_bytes = sum((sibling.size or 0) for sibling in (info.siblings or []))
    except Exception as e:
        # Best-effort, like the file listing in download_model: report and carry on.
        print(f"⚠️ Could not determine repository size: {e}")
        return None
    return total_bytes / (1024 ** 3)


def download_model(repo_id: str, local_dir: str, category: str, description: str,
                   priority: str, max_size_gb: Optional[float] = None) -> bool:
    """Download a model from HuggingFace with error handling and size limits.

    Args:
        repo_id: HuggingFace repository identifier to download.
        local_dir: Directory name (below ``MODELS_DIR / category``) to store the files in.
        category: Top-level classification; used as the parent directory name.
        description: Human-readable description, printed for information only.
        priority: Priority label, printed for information only.
        max_size_gb: Optional upper bound on the repository size in GiB. When given
            and the reported size exceeds it, nothing is downloaded. When the size
            cannot be determined the download proceeds. ``None`` (the default)
            disables the check.

    Returns:
        True if the model is already present locally or was downloaded successfully;
        False if the download failed or was refused because of ``max_size_gb``.
    """
    target_path = MODELS_DIR / category / local_dir

    # Check if already exists
    # NOTE(review): any non-empty directory counts as "already downloaded", so files
    # left behind by an interrupted run would also be skipped - confirm this is intended.
    if target_path.exists() and any(target_path.iterdir()):
        print(f"⏭️ {local_dir} already exists, skipping...")
        return True

    try:
        print(f"📥 Downloading {local_dir}...")
        print(f" Repo: {repo_id}")
        print(f" Category: {category}")
        print(f" Priority: {priority}")
        print(f" Description: {description}")

        # Check if repo exists (informational only: a failure here is not fatal)
        try:
            files = list_repo_files(repo_id)
            print(f" Found {len(files)} files in repository")
        except Exception as e:
            print(f"⚠️ Could not list files: {e}")
            print(" Attempting download anyway...")

        # Enforce the optional size limit before creating anything on disk.
        if max_size_gb is not None:
            size_gb = _repo_size_gb(repo_id)
            if size_gb is not None and size_gb > max_size_gb:
                print(f"⏭️ {local_dir} is {size_gb:.2f} GB, over the "
                      f"{max_size_gb} GB limit - not downloading")
                print()
                return False

        # Download with size awareness
        target_path.mkdir(parents=True, exist_ok=True)
        # NOTE(review): local_dir_use_symlinks / resume_download may be deprecated in
        # newer huggingface_hub releases - confirm against the installed version.
        snapshot_download(
            repo_id=repo_id,
            local_dir=str(target_path),
            local_dir_use_symlinks=False,
            resume_download=True,
            max_workers=4
        )
        print(f"✅ {local_dir} downloaded successfully!")
        print(f" Location: {target_path}")
        print()
        return True
    except Exception as e:
        # Deliberately broad: one failing model must not abort the whole batch.
        print(f"❌ Error downloading {local_dir}: {e}")
        print(" This may be due to:")
        print(" - Model not yet released on HuggingFace")
        print(" - Incorrect repo_id")
        print(" - Authentication required")
        print(" - Network issues")
        print()
        return False
def create_model_registry(downloaded_models: List[Dict], api_models: Dict) -> Dict:
    """Assemble the registry document describing downloaded and API-only models.

    Args:
        downloaded_models: One record per local model; each record must provide
            the "category" and "priority" keys, which are tallied.
        api_models: Mapping of API-only model identifiers to their metadata.

    Returns:
        A dict holding fixed header fields, the two inputs as given, and a
        "statistics" section with totals and per-category / per-priority counts.

    Raises:
        KeyError: If a record lacks "category" or "priority".
    """
    # Tally first, so the registry can be returned as a single literal.
    category_counts = {}
    priority_counts = {}
    for record in downloaded_models:
        category, priority = record["category"], record["priority"]
        category_counts[category] = category_counts.get(category, 0) + 1
        priority_counts[priority] = priority_counts.get(priority, 0) + 1

    statistics = {
        "total_downloaded": len(downloaded_models),
        "total_api_registered": len(api_models),
        "by_category": category_counts,
        "by_priority": priority_counts,
    }

    classifications = {
        "Core": "Foundation models for primary reasoning and generation",
        "Utility": "Specialized models for embeddings, cost-performance, specific tasks",
        "Research": "Experimental and research-grade models",
        "Lore": "Creative models for video, audio, persona generation",
        "Genetics": "Reserved for future genetic algorithm models",
    }

    return {
        "version": "2.0.0",
        "protocol": "AETHER_HARVEST",
        "generated": datetime.now().isoformat(),
        "discovery_date": "2026-04-03",
        "classifications": classifications,
        "downloaded_models": downloaded_models,
        "api_only_models": api_models,
        "statistics": statistics,
    }
def _print_banner(title: str) -> None:
    """Print *title* between two 80-column rules, followed by a blank line."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)
    print()


def _write_json(path, payload) -> None:
    """Serialize *payload* to *path* as JSON with two-space indentation."""
    with open(path, 'w') as handle:
        json.dump(payload, handle, indent=2)


def main():
    """Download every model in FRONTIER_MODELS, then write the registries.

    Reports whether HF_TOKEN is set, attempts each configured model via
    download_model(), writes model_registry.json and api_models_registry.json
    into MODELS_DIR, and prints a summary.

    Returns:
        True if at least one model was downloaded or already present,
        otherwise False.
    """
    # The token is only detected and reported here; it is not passed on explicitly.
    if os.getenv("HF_TOKEN"):
        print("🔑 HuggingFace token detected")
    else:
        print("⚠️ No HF_TOKEN found - some models may require authentication")
        print(" Set via: export HF_TOKEN=your_token_here")
    print()
    print(f"📁 Models base directory: {MODELS_DIR}")
    print()

    # One record is appended per successful model, so the success and failure
    # counts can be derived from the record list and the attempt counter.
    downloaded_models = []
    total_attempted = 0

    for category, subcategories in FRONTIER_MODELS.items():
        _print_banner(f"📦 CATEGORY: {category}")
        for subcategory, models_list in subcategories.items():
            print(f"🗂️ Subcategory: {subcategory}")
            print("-" * 80)
            for spec in models_list:
                total_attempted += 1

                # Surface any caveat attached to the entry before downloading.
                if "note" in spec:
                    print(f"ℹ️ NOTE: {spec['note']}")

                downloaded = download_model(
                    repo_id=spec["repo_id"],
                    local_dir=spec["local_dir"],
                    category=category,
                    description=spec["description"],
                    priority=spec["priority"]
                )
                if not downloaded:
                    continue

                downloaded_models.append({
                    "name": spec["name"],
                    "category": category,
                    "subcategory": subcategory,
                    "repo_id": spec["repo_id"],
                    "local_path": str(MODELS_DIR / category / spec["local_dir"]),
                    "description": spec["description"],
                    "priority": spec["priority"],
                    "license": spec["license"],
                    "capabilities": spec["capabilities"],
                    "download_date": datetime.now().isoformat()
                })
            print()

    successful = len(downloaded_models)
    failed = total_attempted - successful

    # Registry of everything that is available locally, plus the API-only entries.
    _print_banner("📋 CREATING MODEL REGISTRY")
    registry_path = MODELS_DIR / "model_registry.json"
    _write_json(registry_path, create_model_registry(downloaded_models, API_ONLY_MODELS))
    print(f"✅ Registry saved: {registry_path}")
    print()

    # Separate registry listing only the API-only models.
    api_registry_path = MODELS_DIR / "api_models_registry.json"
    _write_json(api_registry_path, {
        "version": "1.0.0",
        "generated": datetime.now().isoformat(),
        "note": "API-only models (Claude Opus 4.6, GPT-5.4, etc.) - requires API keys",
        "models": API_ONLY_MODELS
    })
    print(f"✅ API Registry saved: {api_registry_path}")
    print()

    # Final summary
    _print_banner("✅ AETHER HARVEST PROTOCOL - DOWNLOAD COMPLETE")
    for line in (
        "📊 Summary:",
        f" Total attempted: {total_attempted}",
        f" Successfully downloaded: {successful}",
        f" Failed: {failed}",
        f" API-only registered: {len(API_ONLY_MODELS)}",
    ):
        print(line)
    print()
    print(f"📁 Downloads location: {MODELS_DIR}")
    print(f"📋 Model registry: {registry_path}")
    print(f"📋 API registry: {api_registry_path}")
    print()

    if successful > 0:
        print("🎯 Downloaded Models by Category:")
        for entry in downloaded_models:
            print(f" ✓ {entry['name']} ({entry['category']}/{entry['subcategory']})")
        print()

    if failed > 0:
        for line in (
            "⚠️ Some models failed to download. This is expected for:",
            " - Models not yet released (Gemma 4, LLaMA 4, etc.)",
            " - Models requiring special authentication",
            " - Placeholder repo IDs",
        ):
            print(line)
        print()

    for line in (
        "🚀 Next Steps:",
        " 1. Monitor for Gemma 4 and LLaMA 4 official releases",
        " 2. Update repo_ids when models become available",
        " 3. Re-run this script to download newly released models",
        " 4. Test models: python scripts/test_frontier_models.py",
        " 5. Integrate into RAG: python scripts/rag_ingest.py",
    ):
        print(line)
    print()

    return successful > 0
if __name__ == "__main__":
    # Map main()'s boolean result to a process exit status: 0 on success, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)