---
# Training Model Registry
# Defines base models available for LoRA training with their optimal parameters
training_models:
  # FLUX - Best for photorealistic images (recommended for realistic person)
  flux2_dev:
    name: "FLUX.2 Dev (Recommended)"
    description: "Latest FLUX model, 32B params, best quality for realistic person. Uses Mistral text encoder."
    hf_repo: "black-forest-labs/FLUX.2-dev"
    hf_filename: "flux2-dev.safetensors"
    model_type: "flux2"
    training_framework: "musubi-tuner"
    resolution: 1024
    learning_rate: 1.0
    network_rank: 64
    network_alpha: 32
    optimizer: "prodigy"
    lr_scheduler: "constant"
    timestep_sampling: "flux2_shift"
    network_module: "networks.lora_flux_2"
    max_train_steps: 50
    fp8_base: true
    gradient_checkpointing: true
    use_case: "images"
    vram_required_gb: 48
    recommended_gpu: "NVIDIA RTX A6000"
    recommended_images: "15-30 high quality photos with detailed captions"
    training_script: "flux_2_train_network.py"
    # Model paths on network volume:
    #   DiT: /workspace/models/FLUX.2-dev/flux2-dev.safetensors
    #   VAE: /workspace/models/FLUX.2-dev/vae/diffusion_pytorch_model.safetensors
    #   Text encoder: /workspace/models/FLUX.2-dev/text_encoder/model-00001-of-00010.safetensors

  flux1_dev:
    name: "FLUX.1 Dev"
    description: "Previous gen FLUX, still excellent for realistic person LoRAs"
    hf_repo: "black-forest-labs/FLUX.1-dev"
    hf_filename: "flux1-dev.safetensors"
    model_type: "flux"
    resolution: 768
    learning_rate: 4e-4
    text_encoder_lr: 4e-5
    network_rank: 32
    network_alpha: 16
    clip_skip: 1
    optimizer: "AdamW8bit"
    lr_scheduler: "cosine"
    min_snr_gamma: 5
    max_train_steps: 1500
    use_case: "images"
    vram_required_gb: 24
    recommended_images: "15-30 high quality photos"
    training_script: "flux_train_network.py"

  # WAN 2.2 - Text-to-Video LoRA training (14B params, uses musubi-tuner)
  wan22_t2v:
    name: "WAN 2.2 T2V (14B)"
    description: "WAN 2.2 text-to-video model. Trains natural-looking video LoRAs. Requires A100 80GB."
    model_type: "wan22"
    training_framework: "musubi-tuner"
    training_script: "wan_train_network.py"
    network_module: "networks.lora_wan"
    resolution: 512
    learning_rate: 2e-4
    network_rank: 64
    network_alpha: 32
    optimizer: "adamw8bit"
    lr_scheduler: "constant"
    timestep_sampling: "shift"
    discrete_flow_shift: 5.0
    gradient_checkpointing: true
    max_train_steps: 2000
    save_every_n_steps: 500
    use_case: "images+video"
    vram_required_gb: 48
    recommended_gpu: "NVIDIA A100 80GB"
    recommended_images: "20-50 high quality photos with detailed captions"
    # Model paths on network volume:
    #   DiT low-noise: /workspace/models/WAN2.2/wan2.2_t2v_low_noise_14B_fp16.safetensors
    #   DiT high-noise: /workspace/models/WAN2.2/wan2.2_t2v_high_noise_14B_fp16.safetensors
    #   VAE: /workspace/models/WAN2.2/Wan2.1_VAE.pth
    #   T5: /workspace/models/WAN2.2/models_t5_umt5-xxl-enc-bf16.pth

  # SD 1.5 Realistic Vision - Good balance of quality and speed
  sd15_realistic:
    name: "Realistic Vision V5.1"
    description: "SD 1.5 based, great for realistic humans, faster training"
    hf_repo: "SG161222/Realistic_Vision_V5.1_noVAE"
    hf_filename: "Realistic_Vision_V5.1_fp16-no-ema.safetensors"
    model_type: "sd15"
    resolution: 512
    learning_rate: 1e-4
    network_rank: 32
    network_alpha: 16
    clip_skip: 1
    optimizer: "AdamW8bit"
    use_case: "images"
    vram_required_gb: 8
    recommended_images: "15-30 photos"

  # SDXL - Higher quality than SD 1.5, but more VRAM
  sdxl_base:
    name: "SDXL Base 1.0"
    description: "Higher resolution and quality than SD 1.5"
    hf_repo: "stabilityai/stable-diffusion-xl-base-1.0"
    hf_filename: "sd_xl_base_1.0.safetensors"
    model_type: "sdxl"
    resolution: 1024
    learning_rate: 1e-4
    network_rank: 32
    network_alpha: 16
    clip_skip: 2
    optimizer: "AdamW8bit"
    use_case: "images"
    vram_required_gb: 12
    recommended_images: "20-40 photos"

# Video generation models (for img2video, not training)
video_models:
  wan22_i2v:
    name: "WAN 2.2 Image-to-Video"
    description: "Converts images to videos, use with your trained LoRA images"
    hf_repo: "Wan-AI/Wan2.2-I2V-A14B"
    model_type: "wan22"
    use_case: "img2video"
    vram_required_gb: 24
    resolution: "480p/720p"

# Default model for training
default_training_model: "flux2_dev"