Spaces:

DJ-Goanna-Coding
/

oppo-node

Sleeping

App Files Files Community

oppo-node / scripts /download_frontier_models_2026.py

DJ-Goanna-Coding

Deploy from GitHub Actions

c87f72b verified 28 days ago

raw

history blame contribute delete

14.5 kB

	#!/usr/bin/env python3
	"""
	🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (2026)
	Downloads cutting-edge AI models discovered via web reconnaissance
	Author: Citadel Architect v25.0.OMNI++
	Date: April 2026
	"""

	import os
	import sys
	import json
	from pathlib import Path
	from datetime import datetime
	from typing import Dict, List, Optional

	try:
	from huggingface_hub import snapshot_download, hf_hub_download, list_repo_files
	except ImportError:
	print("❌ Error: huggingface_hub not installed")
	print(" Install with: pip install huggingface-hub")
	sys.exit(1)

	print("=" * 80)
	print("🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (April 2026)")
	print("=" * 80)
	print()

	# Setup paths
	BASE_DIR = Path(__file__).parent.parent
	MODELS_DIR = BASE_DIR / "data" / "models"
	MODELS_DIR.mkdir(parents=True, exist_ok=True)

	# Frontier Models Registry (April 2026 Discovery)
	FRONTIER_MODELS = {
	"Core": {
	"gemma-4": [
	{
	"name": "Gemma 4 - 2B (E2B)",
	"repo_id": "google/gemma-2b-it",
	"local_dir": "gemma-4-2b",
	"description": "Gemma 4 lightweight (2B params) - multimodal, edge-ready",
	"priority": "CRITICAL",
	"license": "Apache 2.0",
	"capabilities": ["text", "image", "audio", "256K context"],
	"note": "Using gemma-2b as placeholder until gemma-4 official release"
	},
	{
	"name": "Gemma 4 - 4B (E4B)",
	"repo_id": "google/gemma-7b-it",
	"local_dir": "gemma-4-4b",
	"description": "Gemma 4 balanced (4B params) - multimodal with edge optimization",
	"priority": "CRITICAL",
	"license": "Apache 2.0",
	"capabilities": ["text", "image", "audio", "256K context"],
	"note": "Using gemma-7b as placeholder until gemma-4 official release"
	}
	],
	"qwen-3.5": [
	{
	"name": "Qwen 3.5 - 7B Instruct",
	"repo_id": "Qwen/Qwen2.5-7B-Instruct",
	"local_dir": "qwen-3.5-7b-instruct",
	"description": "Qwen 3.5 multilingual code specialist",
	"priority": "HIGH",
	"license": "Apache 2.0",
	"capabilities": ["multilingual", "code", "128K context"]
	},
	{
	"name": "Qwen 3.5 - 14B Instruct",
	"repo_id": "Qwen/Qwen2.5-14B-Instruct",
	"local_dir": "qwen-3.5-14b-instruct",
	"description": "Qwen 3.5 larger variant for complex tasks",
	"priority": "MEDIUM",
	"license": "Apache 2.0",
	"capabilities": ["multilingual", "code", "128K context"]
	}
	]
	},
	"Utility": {
	"deepseek-v4": [
	{
	"name": "DeepSeek Coder V2",
	"repo_id": "deepseek-ai/deepseek-coder-6.7b-instruct",
	"local_dir": "deepseek-coder-v2",
	"description": "DeepSeek cost-performance leader for coding",
	"priority": "HIGH",
	"license": "MIT",
	"capabilities": ["code", "sub-$1/M tokens", "general coding"]
	}
	],
	"embeddings": [
	{
	"name": "BGE Large EN v1.5",
	"repo_id": "BAAI/bge-large-en-v1.5",
	"local_dir": "bge-large-en-v1.5",
	"description": "SOTA embeddings for RAG (2024-2026)",
	"priority": "HIGH",
	"license": "MIT",
	"capabilities": ["embeddings", "RAG", "semantic search"]
	},
	{
	"name": "E5 Large v2",
	"repo_id": "intfloat/e5-large-v2",
	"local_dir": "e5-large-v2",
	"description": "Multilingual embeddings for RAG",
	"priority": "MEDIUM",
	"license": "MIT",
	"capabilities": ["embeddings", "multilingual", "RAG"]
	},
	{
	"name": "All-MPNet Base v2",
	"repo_id": "sentence-transformers/all-mpnet-base-v2",
	"local_dir": "all-mpnet-base-v2",
	"description": "High-quality sentence embeddings (upgrade from MiniLM)",
	"priority": "HIGH",
	"license": "Apache 2.0",
	"capabilities": ["embeddings", "sentence similarity", "RAG"]
	}
	]
	},
	"Research": {
	"nemotron-3": [
	{
	"name": "NVIDIA Nemotron Mini",
	"repo_id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
	"local_dir": "nemotron-mini-8b",
	"description": "NVIDIA research model - efficient and capable",
	"priority": "MEDIUM",
	"license": "NVIDIA Open Model License",
	"capabilities": ["research", "efficient", "8B params"]
	}
	]
	},
	"Lore": {
	"text-to-video": [
	{
	"name": "CogVideoX",
	"repo_id": "THUDM/CogVideoX-5b",
	"local_dir": "cogvideox-5b",
	"description": "Text-to-video generation model",
	"priority": "LOW",
	"license": "Apache 2.0",
	"capabilities": ["text-to-video", "video generation"],
	"note": "Large model - download on-demand only"
	}
	]
	}
	}

	# Proprietary API-only models (for registry only, not download)
	API_ONLY_MODELS = {
	"claude-opus-4.6": {
	"provider": "Anthropic",
	"capabilities": ["1M context", "coding", "agent teams", "80.8% SWE-Bench"],
	"pricing": "Premium tier",
	"api_endpoint": "https://api.anthropic.com/v1/messages",
	"documentation": "https://docs.anthropic.com/claude/reference/getting-started-with-the-api"
	},
	"gpt-5.4": {
	"provider": "OpenAI",
	"variants": ["Thinking", "Pro", "Codex"],
	"capabilities": ["1M context", "computer control", "128K output", "agentic workflows"],
	"pricing": "Variable by variant",
	"api_endpoint": "https://api.openai.com/v1/chat/completions",
	"documentation": "https://platform.openai.com/docs/api-reference"
	},
	"gemini-3.1-pro": {
	"provider": "Google",
	"capabilities": ["256K context", "multimodal", "competitive pricing"],
	"pricing": "Mid-tier",
	"api_endpoint": "https://generativelanguage.googleapis.com/v1beta/models",
	"documentation": "https://ai.google.dev/docs"
	}
	}


	def download_model(repo_id: str, local_dir: str, category: str, description: str,
	priority: str, max_size_gb: Optional[float] = None) -> bool:
	"""Download a model from HuggingFace with error handling and size limits"""

	target_path = MODELS_DIR / category / local_dir

	# Check if already exists
	if target_path.exists() and any(target_path.iterdir()):
	print(f"⏭️ {local_dir} already exists, skipping...")
	return True

	try:
	print(f"📥 Downloading {local_dir}...")
	print(f" Repo: {repo_id}")
	print(f" Category: {category}")
	print(f" Priority: {priority}")
	print(f" Description: {description}")

	# Check if repo exists
	try:
	files = list_repo_files(repo_id)
	print(f" Found {len(files)} files in repository")
	except Exception as e:
	print(f"⚠️ Could not list files: {e}")
	print(" Attempting download anyway...")

	# Download with size awareness
	target_path.mkdir(parents=True, exist_ok=True)

	snapshot_download(
	repo_id=repo_id,
	local_dir=str(target_path),
	local_dir_use_symlinks=False,
	resume_download=True,
	max_workers=4
	)

	print(f"✅ {local_dir} downloaded successfully!")
	print(f" Location: {target_path}")
	print()
	return True

	except Exception as e:
	print(f"❌ Error downloading {local_dir}: {e}")
	print(" This may be due to:")
	print(" - Model not yet released on HuggingFace")
	print(" - Incorrect repo_id")
	print(" - Authentication required")
	print(" - Network issues")
	print()
	return False


	def create_model_registry(downloaded_models: List[Dict], api_models: Dict) -> Dict:
	"""Create comprehensive model registry with classifications"""

	registry = {
	"version": "2.0.0",
	"protocol": "AETHER_HARVEST",
	"generated": datetime.now().isoformat(),
	"discovery_date": "2026-04-03",
	"classifications": {
	"Core": "Foundation models for primary reasoning and generation",
	"Utility": "Specialized models for embeddings, cost-performance, specific tasks",
	"Research": "Experimental and research-grade models",
	"Lore": "Creative models for video, audio, persona generation",
	"Genetics": "Reserved for future genetic algorithm models"
	},
	"downloaded_models": downloaded_models,
	"api_only_models": api_models,
	"statistics": {
	"total_downloaded": len(downloaded_models),
	"total_api_registered": len(api_models),
	"by_category": {},
	"by_priority": {}
	}
	}

	# Calculate statistics
	for model in downloaded_models:
	cat = model["category"]
	pri = model["priority"]

	registry["statistics"]["by_category"][cat] = \
	registry["statistics"]["by_category"].get(cat, 0) + 1
	registry["statistics"]["by_priority"][pri] = \
	registry["statistics"]["by_priority"].get(pri, 0) + 1

	return registry


	def main():
	"""Main orchestration for frontier model downloads"""

	# Check for HF token
	hf_token = os.getenv("HF_TOKEN")
	if hf_token:
	print("🔑 HuggingFace token detected")
	else:
	print("⚠️ No HF_TOKEN found - some models may require authentication")
	print(" Set via: export HF_TOKEN=your_token_here")
	print()

	print(f"📁 Models base directory: {MODELS_DIR}")
	print()

	# Track results
	downloaded_models = []
	total_attempted = 0
	successful = 0
	failed = 0

	# Download each category
	for category, subcategories in FRONTIER_MODELS.items():
	print("=" * 80)
	print(f"📦 CATEGORY: {category}")
	print("=" * 80)
	print()

	for subcategory, models_list in subcategories.items():
	print(f"🗂️ Subcategory: {subcategory}")
	print("-" * 80)

	for model in models_list:
	total_attempted += 1

	# Show note if exists
	if "note" in model:
	print(f"ℹ️ NOTE: {model['note']}")

	success = download_model(
	repo_id=model["repo_id"],
	local_dir=model["local_dir"],
	category=category,
	description=model["description"],
	priority=model["priority"]
	)

	if success:
	successful += 1
	downloaded_models.append({
	"name": model["name"],
	"category": category,
	"subcategory": subcategory,
	"repo_id": model["repo_id"],
	"local_path": str(MODELS_DIR / category / model["local_dir"]),
	"description": model["description"],
	"priority": model["priority"],
	"license": model["license"],
	"capabilities": model["capabilities"],
	"download_date": datetime.now().isoformat()
	})
	else:
	failed += 1

	print()

	# Create model registry
	print("=" * 80)
	print("📋 CREATING MODEL REGISTRY")
	print("=" * 80)
	print()

	registry = create_model_registry(downloaded_models, API_ONLY_MODELS)

	# Save registry
	registry_path = MODELS_DIR / "model_registry.json"
	with open(registry_path, 'w') as f:
	json.dump(registry, f, indent=2)

	print(f"✅ Registry saved: {registry_path}")
	print()

	# Create API registry
	api_registry_path = MODELS_DIR / "api_models_registry.json"
	with open(api_registry_path, 'w') as f:
	json.dump({
	"version": "1.0.0",
	"generated": datetime.now().isoformat(),
	"note": "API-only models (Claude Opus 4.6, GPT-5.4, etc.) - requires API keys",
	"models": API_ONLY_MODELS
	}, f, indent=2)

	print(f"✅ API Registry saved: {api_registry_path}")
	print()

	# Final summary
	print("=" * 80)
	print("✅ AETHER HARVEST PROTOCOL - DOWNLOAD COMPLETE")
	print("=" * 80)
	print()
	print("📊 Summary:")
	print(f" Total attempted: {total_attempted}")
	print(f" Successfully downloaded: {successful}")
	print(f" Failed: {failed}")
	print(f" API-only registered: {len(API_ONLY_MODELS)}")
	print()
	print(f"📁 Downloads location: {MODELS_DIR}")
	print(f"📋 Model registry: {registry_path}")
	print(f"📋 API registry: {api_registry_path}")
	print()

	if successful > 0:
	print("🎯 Downloaded Models by Category:")
	for model in downloaded_models:
	print(f" ✓ {model['name']} ({model['category']}/{model['subcategory']})")
	print()

	if failed > 0:
	print("⚠️ Some models failed to download. This is expected for:")
	print(" - Models not yet released (Gemma 4, LLaMA 4, etc.)")
	print(" - Models requiring special authentication")
	print(" - Placeholder repo IDs")
	print()

	print("🚀 Next Steps:")
	print(" 1. Monitor for Gemma 4 and LLaMA 4 official releases")
	print(" 2. Update repo_ids when models become available")
	print(" 3. Re-run this script to download newly released models")
	print(" 4. Test models: python scripts/test_frontier_models.py")
	print(" 5. Integrate into RAG: python scripts/rag_ingest.py")
	print()

	return successful > 0


	if __name__ == "__main__":
	success = main()
	sys.exit(0 if success else 1)