# CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/low_resource_teaser/jeep_watercolor.yaml
pretrained_model_path: "FateZero/ckpt/stable-diffusion-v1-4"

train_dataset:
  path: "FateZero/data/teaser_car-turn"
  prompt: "a silver jeep driving down a curvy road in the countryside"
  n_sample_frame: 8
  sampling_rate: 1
  stride: 80
  offset:
    left: 0
    right: 0
    top: 0
    bottom: 0

validation_sample_logger_config:
  use_train_latents: true
  use_inversion_attention: true
  guidance_scale: 7.5
  source_prompt: "${train_dataset.prompt}"
  prompts: [
    # a silver jeep driving down a curvy road in the countryside,
    watercolor painting of a silver jeep driving down a curvy road in the countryside,
  ]
  annotate: False
  p2p_config:
    0:
      # Whether to directly copy the cross attention from the source.
      # True: directly copy, better for object replacement.
      # False: keep the source attention, better for style edits.
      is_replace_controller: False

      # Semantic layout preservation. Higher values replace cross attention for more steps, preserving the semantic layout.
      cross_replace_steps:
        default_: 0.8

      # Source background structure preservation, in [0, 1].
      # e.g., 0.6 replaces the self-attention of the first 60% of steps.
      self_replace_steps: 0.8

      # Amplify the cross attention of the target words; a larger value moves the edit closer to the target.
      eq_params:
        words: ["watercolor"]
        values: [10, 10]

      # Target structure-divergence hyperparameters.
      # If you change the shape of the object, it is better to use all three lines below; otherwise they are not needed.
      # Without the following three lines, all self-attention is replaced.
      # blend_words: [['jeep',], ["car",]]
      # masked_self_attention: True
      # masked_latents: False        # performance is not good in our case; needs debugging
      # bend_th: [2, 2]
      # bend_th preserves the source structure of blend_words, in [0, 1].
      # The default bend_th: [2, 2] replaces the full-resolution edit source with self-attention.
      # bend_th -> [0.0, 0.0] sets the mask to 1: more edit self-attention, more generated shape, less source attention.

  clip_length: "${..train_dataset.n_sample_frame}"
  sample_seeds: [0]
  num_inference_steps: 10
  prompt2prompt_edit: True

model_config:
  lora: 160
  # temporal_downsample_time: 4
  SparseCausalAttention_index: ['mid']
  least_sc_channel: 640
  # least_sc_channel: 100000

test_pipeline_config:
  target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
  num_inference_steps: "${..validation_sample_logger.num_inference_steps}"

epsilon: 1e-5
train_steps: 10
seed: 0
learning_rate: 1e-5
train_temporal_conv: False
guidance_scale: "${validation_sample_logger_config.guidance_scale}"
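
# Minimal usage sketch (kept as comments so this file stays valid YAML; not part of the config itself).
# Assumption: the "${...}" interpolation syntax above is resolved by OmegaConf (>= 2.1 for the relative
# "${..}" form), which FateZero's test script is expected to use; the exact loading code in
# test_fatezero.py may differ.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("config/low_resource_teaser/jeep_watercolor.yaml")
#   # Interpolations resolve lazily on access; clip_length follows train_dataset.n_sample_frame (8):
#   print(cfg.validation_sample_logger_config.clip_length)
#   # Fields can also be overridden from code before launching, e.g. fewer DDIM steps:
#   cfg.validation_sample_logger_config.num_inference_steps = 8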