method: 'diffsketcher'
image_size: 224 # canvas size
path_svg: ~ # load an existing SVG file and train from it
mask_object: False # if the target image contains a background, it is better to mask it out
fix_scale: False # if the target image is not square, it is recommended to fix the scale
# train
num_iter: 2000
num_stages: 1 # training stages: train x strokes, freeze them, then train another x strokes, and so on
lr_schedule: False
lr_decay_rate: 0.1
decay_steps: [ 1000, 1500 ]
lr: 1
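# e.g. with lr_schedule: True, the learning rate is (assumed to be) scaled by
# lr_decay_rate at each entry of decay_steps: 1.0 -> 0.1 at step 1000 -> 0.01 at step 1500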
color_lr: 0.01
color_vars_threshold: 0.0 # takes effect only if the corresponding code is uncommented
width_lr: 0.1
max_width: 50 # maximum stroke width
# stroke attrs
num_paths: 128 # number of strokes
width: 1.5 # stroke width
control_points_per_seg: 4
num_segments: 1
optim_opacity: True # if True, the stroke opacity is optimized
optim_width: False # if True, the stroke width is optimized
optim_rgba: False # if True, the stroke RGBA is optimized
opacity_delta: 0 # opacity threshold for stroke pruning
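# rough count for the settings above: 128 paths x 1 segment x 4 control points
# = 512 optimizable control points (assuming segments do not share endpoints)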
# init strokes
attention_init: True # if True, use the attention maps of the DINO model to place the initial strokes
xdog_intersec: True # initialize along edges by mixing the XDoG edge map with the attention map
softmax_temp: 0.5 # softmax temperature applied to the attention map before sampling
cross_attn_res: 16 # resolution of the cross-attention maps
self_attn_res: 32 # resolution of the self-attention maps
max_com: 20 # max number of self-attention components to keep
mean_comp: False # if True, use the mean of the self-attention components
comp_idx: 0 # otherwise, index of the component to use
attn_coeff: 1.0 # attn fusion, w * cross-attn + (1-w) * self-attn
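# e.g. attn_coeff: 1.0 uses the cross-attention map only, 0.0 uses the
# self-attention map only, and 0.5 blends them equally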
log_cross_attn: False # if True, log the cross-attention maps at every step
u2net_path: "./checkpoint/u2net/u2net.pth" # U^2-Net weights used for object masking
# ldm
model_id: "sd15" # which Stable Diffusion checkpoint to use
ldm_speed_up: False # if True, enable LDM inference speed-ups
enable_xformers: True # use xFormers memory-efficient attention
gradient_checkpoint: False # trade compute for memory via gradient checkpointing
token_ind: 5 # index of the prompt token whose cross-attention map guides stroke initialization
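# e.g. for the prompt "a photo of a horse", CLIP's tokenizer (assuming the usual
# start-of-text token at index 0) places "horse" at token index 5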
use_ddim: True # use the DDIM sampler
num_inference_steps: 100 # number of denoising steps
guidance_scale: 7.5 # classifier-free guidance scale (SDXL default is 5.0)
# ASDS loss
sds:
  crop_size: 512
  augmentations: "affine"
  guidance_scale: 100 # high CFG scale, as is common for score-distillation objectives
  grad_scale: 1e-6
  t_range: [ 0.05, 0.95 ]
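  # a rough reading: with the (assumed) 1000-step diffusion schedule, t_range
  # [0.05, 0.95] restricts sampled timesteps to about [50, 950], skipping the
  # least and most noisy extremes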
  warmup: 2000
clip:
  model_name: "RN101" # RN101 or ViT-L/14
  feats_loss_type: "l2" # CLIP visual loss type over conv-layer features
  feats_loss_weights: [ 0,0,1.0,1.0,0 ] # RN-based
  # feats_loss_weights: [ 0,0,1.0,1.0,0,0,0,0,0,0,0,0 ] # ViT-based
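  # the weight list length matches the number of feature levels the backbone
  # exposes (assumed: 5 for the ResNet, 12 for ViT-L/14); here only two middle
  # RN101 layers contribute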
  fc_loss_weight: 0.1 # weight of the CLIP visual loss on the final fc layer
  augmentations: "affine" # augmentations applied before computing the CLIP visual loss
  num_aug: 4 # number of augmentations per image
  vis_loss: 1 # 1 to enable, 0 to disable the CLIP visual loss
  text_visual_coeff: 0 # weight of the cosine similarity between text and image embeddings
perceptual:
  name: "lpips" # or "dists"
  lpips_net: 'vgg'
  coeff: 0.2