# Training configuration with six top-level sections:
# data, model, vae, text_encoder, scheduler, train.

# dataset config
data:
  data_dir: []
  image_size: 1024
  caption_proportion:
    prompt: 1
  external_caption_suffixes: []
  external_clipscore_suffixes: []
  clip_thr_temperature: 0.1
  clip_thr: 25.0
  load_text_feat: false
  load_vae_feat: true
  transform: default_train
  type: SanaWebDatasetMS
  sort_dataset: false

# model config
model:
  model: SanaMS_600M_P1_D28
  image_size: 1024
  mixed_precision: fp16  # ['fp16', 'fp32', 'bf16']
  fp32_attention: true
  # NOTE(review): both keys had no value in the original; written as explicit
  # null siblings here -- confirm they were not meant to nest.
  load_from: null
  resume_from: null
  aspect_ratio_type: ASPECT_RATIO_1024
  multi_scale: true
  attn_type: linear
  ffn_type: glumbconv
  mlp_acts:
    - silu
    - silu
    # NOTE(review): third entry had no value in the original (loads as null);
    # made explicit -- confirm.
    - null
  mlp_ratio: 2.5
  use_pe: false
  qk_norm: false
  class_dropout_prob: 0.1
  # CFG & PAG settings
  pag_applied_layers:
    - 14

# VAE setting
vae:
  vae_type: dc-ae
  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
  scale_factor: 0.41407
  vae_latent_dim: 32
  vae_downsample_rate: 32
  sample_posterior: true

# text encoder
text_encoder:
  text_encoder_name: gemma-2-2b-it
  y_norm: true
  y_norm_scale_factor: 0.01
  model_max_length: 300
  # CHI
  # Each entry is a single-quoted literal; the last one ends with a
  # significant trailing space, so keep the quotes.
  chi_prompt:
    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
    - 'Here are examples of how to transform or refine prompts:'
    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
    - 'User Prompt: '

# Sana schedule Flow
scheduler:
  predict_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  flow_shift: 4.0
  # logit-normal timestep
  weighting_scheme: logit_normal
  logit_mean: 0.0
  logit_std: 1.0
  vis_sampler: flow_dpm-solver

# training setting
train:
  num_workers: 10
  seed: 1
  train_batch_size: 64
  num_epochs: 100
  gradient_accumulation_steps: 1
  grad_checkpointing: true
  gradient_clip: 0.1
  optimizer:
    betas:
      - 0.9
      - 0.999
      - 0.9999
    # Keep the "1.0e-NN" spelling (digit, dot, signed exponent): YAML 1.1
    # loaders may read forms such as "1e-30" as strings instead of floats.
    eps:
      - 1.0e-30
      - 1.0e-16
    lr: 0.0001
    type: CAMEWrapper
    weight_decay: 0.0
  lr_schedule: constant
  lr_schedule_args:
    num_warmup_steps: 2000
  local_save_vis: true  # if save log image locally
  visualize: true
  eval_sampling_steps: 500
  log_interval: 20
  save_model_epochs: 5
  save_model_steps: 500
  work_dir: output/debug
  online_metric: false
  eval_metric_step: 2000
  online_metric_dir: metric_helper