# CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/teaser/jeep_posche.yaml
pretrained_model_path: "./ckpt/jeep_tuned_200"

train_dataset:
  path: "data/teaser_car-turn"
  prompt: "a silver jeep driving down a curvy road in the countryside,"
  n_sample_frame: 8
  sampling_rate: 1
  stride: 80
  offset:
    left: 0
    right: 0
    top: 0
    bottom: 0

validation_sample_logger_config:
  use_train_latents: true
  use_inversion_attention: true
  guidance_scale: 7.5
  prompts: [
    a silver jeep driving down a curvy road in the countryside,
    a Porsche car driving down a curvy road in the countryside,
  ]
  p2p_config:
    0:
      # Whether to directly copy the cross attention from the source.
      # True: directly copy, better for object replacement.
      # False: keep the source attention, better for style editing.
      is_replace_controller: False

      # Semantic layout preserving. Higher steps replace more cross attention and preserve more of the semantic layout.
      cross_replace_steps:
        default_: 0.8

      # Source background structure preserving, in [0, 1].
      # e.g., 0.6 replaces the self-attention in the first 60% of steps.
      self_replace_steps: 0.9

      # Amplify the cross attention of the target words; a larger value moves the result closer to the target.
      # Useful in style editing.
      eq_params:
        words: ["watercolor", "painting"]
        values: [10, 10]

      # Target structure-divergence hyperparameters.
      # If you change the shape of the object, it is better to use all three lines below; otherwise, they are not needed.
      # Without the following three lines, all self-attention will be replaced.
      # Useful in shape editing.
      blend_words: [['jeep',], ["car",]]
      masked_self_attention: True
      # masked_latents: False  # Directly copy the latents; performance is not so good in our case.
      # Preserve the source structure of blend_words, in [0, 1].
      # bend_th -> [1.0, 1.0]: mask -> 0, use inversion-time attention; the structure stays similar to the input.
      # bend_th -> [0.0, 0.0]: mask -> 1, use more edited self-attention; more generated shape, less source attention.
      bend_th: [0.3, 0.3]

    1:
      cross_replace_steps:
        default_: 0.5
      self_replace_steps: 0.5
      use_inversion_attention: True
      is_replace_controller: True
      # For local editing. If the edit is not local yet, use only the source object: blend_word = ((('cat',), ("cat",))).
      blend_words: [['silver', 'jeep'], ["Porsche", 'car']]
      masked_self_attention: True
      bend_th: [0.3, 0.3]

  clip_length: "${..train_dataset.n_sample_frame}"
  sample_seeds: [0]
  num_inference_steps: 50
  prompt2prompt_edit: True

model_config:
  lora: 160

test_pipeline_config:
  target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
  num_inference_steps: "${..validation_sample_logger.num_inference_steps}"

epsilon: 1e-5
train_steps: 10
seed: 0
learning_rate: 1e-5
train_temporal_conv: False
guidance_scale: "${validation_sample_logger_config.guidance_scale}"
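
# The "${...}" values above look like OmegaConf-style interpolations. Below is a minimal,
# hypothetical sketch (kept as comments so this file stays valid YAML) of how such a config
# could be loaded and inspected; it assumes OmegaConf is used, which the interpolation syntax
# suggests but this file does not confirm.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("config/teaser/jeep_posche.yaml")
#   print(cfg.validation_sample_logger_config.prompts[1])  # the Porsche edit prompt
#   print(cfg.guidance_scale)  # resolves "${validation_sample_logger_config.guidance_scale}" -> 7.5
#   cfg.validation_sample_logger_config.num_inference_steps = 100  # override before running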