---
# Dataset settings.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been flattened and whose lines carried stray table
# pipes ("|"). Confirm it against the schema of the loader that reads it.
data:
  data_dir: []
  image_size: 1024
  caption_proportion:
    prompt: 1
  external_caption_suffixes: []
  external_clipscore_suffixes: []
  clip_thr_temperature: 0.1
  clip_thr: 25.0
  load_text_feat: false
  load_vae_feat: false
  transform: default_train
  type: SanaWebDatasetMS
  # NOTE(review): this key had no value in the original, so it is written as
  # an explicit null. It is presumably a sibling of sort_dataset rather than
  # its parent mapping - verify against the consumer.
  data: null
  sort_dataset: false

# Model settings.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been flattened and whose lines carried stray table
# pipes ("|"). Confirm that every key below really belongs under `model`.
model:
  model: SanaMS_1600M_P1_D20
  image_size: 1024
  mixed_precision: fp16
  fp32_attention: true
  # Both keys had empty values in the original; an explicit null parses to
  # the same value and removes the ambiguity of a bare `key:`.
  load_from: null
  resume_from: null
  aspect_ratio_type: ASPECT_RATIO_1024
  multi_scale: true

  attn_type: linear
  ffn_type: glumbconv
  mlp_acts:
    - silu
    - silu
    - null  # third entry was empty in the original
  mlp_ratio: 2.5
  use_pe: false
  qk_norm: false
  class_dropout_prob: 0.1

  pag_applied_layers:
    - 8

# VAE settings.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been flattened and whose lines carried stray table
# pipes ("|"). Confirm it against the schema of the loader that reads it.
vae:
  vae_type: dc-ae
  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
  scale_factor: 0.41407
  vae_latent_dim: 32
  vae_downsample_rate: 32
  sample_posterior: true

# Text encoder settings.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been flattened and whose lines carried stray table
# pipes ("|"). In particular, placing `chi_prompt` under `text_encoder` is
# an inference - confirm against the schema of the loader that reads it.
text_encoder:
  text_encoder_name: gemma-2-2b-it
  y_norm: true
  y_norm_scale_factor: 0.01
  model_max_length: 300

  # The prompt lines are kept as single-quoted scalars, byte-for-byte, so
  # the text (including the trailing space of the last item) is unchanged.
  chi_prompt:
    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
    - 'Here are examples of how to transform or refine prompts:'
    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
    - 'User Prompt: '

# Scheduler settings.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been flattened and whose lines carried stray table
# pipes ("|"). Confirm that every key below really belongs under
# `scheduler`.
scheduler:
  predict_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  flow_shift: 3.0

  weighting_scheme: logit_normal
  logit_mean: 0.0
  logit_std: 1.0
  vis_sampler: flow_dpm-solver

# Training settings.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been flattened and whose lines carried stray table
# pipes ("|"). Which keys sit under `optimizer` versus directly under
# `train` is an inference - confirm against the schema of the loader.
train:
  num_workers: 10
  seed: 1
  train_batch_size: 64
  num_epochs: 100
  gradient_accumulation_steps: 1
  grad_checkpointing: true
  gradient_clip: 0.1
  optimizer:
    betas:
      - 0.9
      - 0.999
      - 0.9999
    eps:
      - 1.0e-30
      - 1.0e-16
    lr: 0.0001
    type: CAMEWrapper
    weight_decay: 0.0
  lr_schedule: constant
  lr_schedule_args:
    num_warmup_steps: 2000
  local_save_vis: true
  visualize: true
  eval_sampling_steps: 500
  log_interval: 20
  save_model_epochs: 5
  save_model_steps: 500
  work_dir: output/debug
  online_metric: false
  eval_metric_step: 2000
  online_metric_dir: metric_helper