# FateZero attribute-editing config: fox -> wolf / grass -> snow.
#
# Usage:
#   CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/fox_wolf_snow.yaml
---
pretrained_model_path: "./ckpt/stable-diffusion-v1-4"

train_dataset:
  path: "data/attribute/fox_wolf_snow"
  prompt: "a white fox sitting in the grass"
  n_sample_frame: 8
  # n_sample_frame: 22
  sampling_rate: 1
  stride: 80
  offset:
    left: 0
    right: 0
    top: 0
    bottom: 0

validation_sample_logger_config:
  use_train_latents: true
  use_inversion_attention: true
  guidance_scale: 7.5
  prompts:
    # Source prompt.
    - "a white fox sitting in the grass"
    # Foreground / texture / style edits.
    - "a grey wolf sitting in the grass"
    - "a white fox sitting in the snow"

  # One entry per integer key.
  # NOTE(review): presumably each key is the index of the matching entry in
  # `prompts` above -- confirm against the consumer.
  p2p_config:
    0:
      # Whether to directly copy the cross attention from source.
      #   true:  directly copy, better for object replacement
      #   false: keep source attention, better for style
      is_replace_controller: false

      # Semantic preserving and replacement (original author note: "debug me").
      cross_replace_steps:
        default_: 0.8

      # Source background structure preserving, in [0, 1].
      # e.g. 0.6 replaces the self-attention of the first 60% of the steps.
      self_replace_steps: 0.6

      # Amplify the cross attention of the target words; a larger value pulls
      # the result closer to the target.
      # NOTE(review): neither word occurs in any prompt of this file -- looks
      # like a leftover from another config; confirm it is intentional.
      eq_params:
        words: ["silver", "sculpture"]
        values: [2, 2]

      # Target structure-divergence hyperparameters.
      # If you change the shape of the object, it is better to use all three
      # of the following keys; otherwise they are not needed.
      # Without the following three keys, all self-attention will be replaced.
      # NOTE(review): "cat" does not occur in any prompt of this file -- confirm.
      blend_words: [["cat"], ["cat"]]
      masked_self_attention: true
      # masked_latents: false  # performance not so good in our case, need debug
      # Preserve the source structure of blend_words, in [0, 1].
      # Default is [2, 2], which preserves all source self-attention.
      # With [0.0, 0.0], mask -> 1: more att_replace is used, i.e. more
      # generated attention and less source attention.
      # NOTE(review): the key is spelled `bend_th` (not `blend_th`); confirm
      # this is the spelling the consumer actually reads.
      bend_th: [2, 2]

    1:
      is_replace_controller: false
      cross_replace_steps:
        default_: 0.5
      self_replace_steps: 0.5
      # NOTE(review): "robotic" does not occur in any prompt of this file --
      # confirm the intended target word.
      eq_params:
        words: ["robotic"]
        values: [10]  # amplification factor for the word(s) listed above

    2:
      is_replace_controller: false
      cross_replace_steps:
        default_: 0.5
      self_replace_steps: 0.5
      eq_params:
        words: ["snow"]
        values: [10]  # amplification factor for the word(s) listed above

  # Relative interpolation: resolves to train_dataset.n_sample_frame.
  clip_length: "${..train_dataset.n_sample_frame}"
  sample_seeds: [0]
  val_all_frames: false

  num_inference_steps: 50
  prompt2prompt_edit: true

model_config:
  lora: 160
  # temporal_downsample_time: 4
  SparseCausalAttention_index: ["mid"]
  least_sc_channel: 640
  # least_sc_channel: 100000

test_pipeline_config:
  target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
  # Relative interpolation; must name the `validation_sample_logger_config`
  # section (the former target `validation_sample_logger` does not exist in
  # this file and could not be resolved).
  num_inference_steps: "${..validation_sample_logger_config.num_inference_steps}"

# Written with an explicit decimal point so that YAML 1.1 loaders also parse
# these as floats rather than strings.
epsilon: 1.0e-5
train_steps: 10
seed: 0
learning_rate: 1.0e-5
train_temporal_conv: false
guidance_scale: "${validation_sample_logger_config.guidance_scale}"