{"train_file": "train.json", "validation_file": "valid.json", "test_file": "test.json", "num_examples": -1, "fea_encoder_name": "openai/clip-vit-large-patch14", "scheduler_name": "stabilityai/stable-diffusion-2-1", "unet_model_name": null, "unet_model_config": "configs/diffusion_model_config_large_2048.json", "hf_model": null, "snr_gamma": 5.0, "freeze_text_encoder": true, "text_column": "captions", "image_column": "img", "audio_column": "audio_file", "video_column": "feature_file", "augment": false, "uncondition": false, "prefix": null, "per_device_train_batch_size": 2, "per_device_eval_batch_size": 2, "learning_rate": 6e-05, "weight_decay": 1e-08, "num_train_epochs": 80, "max_train_steps": null, "gradient_accumulation_steps": 32, "lr_scheduler_type": "linear", "num_warmup_steps": 300, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_weight_decay": 0.01, "adam_epsilon": 1e-08, "output_dir": "saved/vta_ldm_clip4clip_large", "seed": null, "checkpointing_steps": "best", "save_every": 1, "resume_from_checkpoint": null, "vae_model": "ckpt/vta-ldm-clip4clip-v-large/audioldm-s-full.ckpt", "with_tracking": false, "embedding_dim": 2048, "sample_rate": 16000, "report_to": "all", "task": "video2audio", "img_pretrained_model_path": "None"} {"epoch": [1], "step": 372, "train_loss": 0.8671, "val_loss": 0.823} {"epoch": [2], "step": 744, "train_loss": 0.7656, "val_loss": 0.7662} {"epoch": [3], "step": 1116, "train_loss": 0.7378, "val_loss": 0.7439} {"epoch": [4], "step": 1488, "train_loss": 0.7264, "val_loss": 0.7197} {"epoch": [5], "step": 1860, "train_loss": 0.7146, "val_loss": 0.7098} {"epoch": [6], "step": 2232, "train_loss": 0.7067, "val_loss": 0.7075} {"epoch": [7], "step": 2604, "train_loss": 0.6999, "val_loss": 0.7001} {"epoch": [8], "step": 2976, "train_loss": 0.695, "val_loss": 0.6944} {"epoch": [9], "step": 3348, "train_loss": 0.6952, "val_loss": 0.6912} {"epoch": [10], "step": 3720, "train_loss": 0.6946, "val_loss": 0.6975} {"epoch": [11], "step": 4092, "train_loss": 0.6885, "val_loss": 0.6864} {"epoch": [12], "step": 4464, "train_loss": 0.6861, "val_loss": 0.6831} {"epoch": [13], "step": 4836, "train_loss": 0.6889, "val_loss": 0.682} {"epoch": [14], "step": 5208, "train_loss": 0.6884, "val_loss": 0.6832} {"epoch": [15], "step": 5580, "train_loss": 0.6842, "val_loss": 0.678} {"epoch": [16], "step": 5952, "train_loss": 0.681, "val_loss": 0.6783} {"epoch": [17], "step": 6324, "train_loss": 0.6807, "val_loss": 0.6789} {"epoch": [18], "step": 6696, "train_loss": 0.6771, "val_loss": 0.6733} {"epoch": [19], "step": 7068, "train_loss": 0.681, "val_loss": 0.6726} {"epoch": [20], "step": 7440, "train_loss": 0.6747, "val_loss": 0.6744} {"epoch": [21], "step": 7812, "train_loss": 0.6755, "val_loss": 0.6705} {"epoch": [22], "step": 8184, "train_loss": 0.6739, "val_loss": 0.6693} {"epoch": [23], "step": 8556, "train_loss": 0.672, "val_loss": 0.6694} {"epoch": [24], "step": 8928, "train_loss": 0.6713, "val_loss": 0.6669} {"epoch": [25], "step": 9300, "train_loss": 0.6698, "val_loss": 0.6674} {"epoch": [26], "step": 9672, "train_loss": 0.6814, "val_loss": 0.6683} {"epoch": [27], "step": 10044, "train_loss": 0.6703, "val_loss": 0.6649} {"epoch": [28], "step": 10416, "train_loss": 0.668, "val_loss": 0.6665} {"epoch": [29], "step": 10788, "train_loss": 0.6671, "val_loss": 0.6638} {"epoch": [30], "step": 11160, "train_loss": 0.676, "val_loss": 0.663} {"epoch": [31], "step": 11532, "train_loss": 0.6664, "val_loss": 0.6619} {"epoch": [32], "step": 11904, "train_loss": 0.6739, "val_loss": 0.6619} {"epoch": [33], "step": 12276, "train_loss": 0.6626, "val_loss": 0.662} {"epoch": [34], "step": 12648, "train_loss": 0.6658, "val_loss": 0.6603} {"epoch": [35], "step": 13020, "train_loss": 0.6632, "val_loss": 0.661} {"epoch": [36], "step": 13392, "train_loss": 0.7038, "val_loss": 0.6691} {"epoch": [37], "step": 13764, "train_loss": 0.6658, "val_loss": 0.6631} {"epoch": [38], "step": 14136, "train_loss": 0.666, "val_loss": 0.663} {"epoch": [39], "step": 14508, "train_loss": 0.6639, "val_loss": 0.6598} {"epoch": [40], "step": 14880, "train_loss": 0.6624, "val_loss": 0.6585} {"epoch": [41], "step": 15252, "train_loss": 0.6648, "val_loss": 0.6582} {"epoch": [42], "step": 15624, "train_loss": 0.662, "val_loss": 0.6602} {"epoch": [43], "step": 15996, "train_loss": 0.6617, "val_loss": 0.6578} {"epoch": [44], "step": 16368, "train_loss": 0.6614, "val_loss": 0.6575} {"epoch": [45], "step": 16740, "train_loss": 0.6617, "val_loss": 0.657} {"epoch": [46], "step": 17112, "train_loss": 0.6573, "val_loss": 0.6571} {"epoch": [47], "step": 17484, "train_loss": 0.6594, "val_loss": 0.6613} {"epoch": [48], "step": 17856, "train_loss": 0.66, "val_loss": 0.657} {"epoch": [49], "step": 18228, "train_loss": 0.6593, "val_loss": 0.6575} {"epoch": [50], "step": 18600, "train_loss": 0.662, "val_loss": 0.6566} {"epoch": [51], "step": 18972, "train_loss": 0.6591, "val_loss": 0.6572} {"epoch": [52], "step": 19344, "train_loss": 0.6677, "val_loss": 0.6553} {"epoch": [53], "step": 19716, "train_loss": 0.6589, "val_loss": 0.6559} {"epoch": [54], "step": 20088, "train_loss": 0.6578, "val_loss": 0.6544} {"epoch": [55], "step": 20460, "train_loss": 0.6547, "val_loss": 0.6551} {"epoch": [56], "step": 20832, "train_loss": 0.6557, "val_loss": 0.6564} {"epoch": [57], "step": 21204, "train_loss": 0.6577, "val_loss": 0.6538} {"epoch": [58], "step": 21576, "train_loss": 0.6601, "val_loss": 0.6532} {"epoch": [59], "step": 21948, "train_loss": 0.6572, "val_loss": 0.6531} {"epoch": [60], "step": 22320, "train_loss": 0.6749, "val_loss": 0.6543} {"epoch": [61], "step": 22692, "train_loss": 0.6567, "val_loss": 0.653} {"epoch": [62], "step": 23064, "train_loss": 0.656, "val_loss": 0.6528} {"epoch": [63], "step": 23436, "train_loss": 0.6559, "val_loss": 0.653} {"epoch": [64], "step": 23808, "train_loss": 0.6571, "val_loss": 0.6517} {"epoch": [65], "step": 24180, "train_loss": 0.6543, "val_loss": 0.6513} {"epoch": [66], "step": 24552, "train_loss": 0.6563, "val_loss": 0.6517} {"epoch": [67], "step": 24924, "train_loss": 0.6561, "val_loss": 0.6515} {"epoch": [68], "step": 25296, "train_loss": 0.6546, "val_loss": 0.6514} {"epoch": [69], "step": 25668, "train_loss": 0.6556, "val_loss": 0.6517} {"epoch": [70], "step": 26040, "train_loss": 0.6569, "val_loss": 0.6527} {"epoch": [71], "step": 26412, "train_loss": 0.6549, "val_loss": 0.6511} {"epoch": [72], "step": 26784, "train_loss": 0.6545, "val_loss": 0.6514} {"epoch": [73], "step": 27156, "train_loss": 0.656, "val_loss": 0.6498} {"epoch": [74], "step": 27528, "train_loss": 0.6533, "val_loss": 0.6511} {"epoch": [75], "step": 27900, "train_loss": 0.6517, "val_loss": 0.6496} {"epoch": [76], "step": 28272, "train_loss": 0.6531, "val_loss": 0.6493} {"epoch": [77], "step": 28644, "train_loss": 0.649, "val_loss": 0.649} {"epoch": [78], "step": 29016, "train_loss": 0.651, "val_loss": 0.6495} {"epoch": [79], "step": 29388, "train_loss": 0.6531, "val_loss": 0.6497} {"epoch": [80], "step": 29760, "train_loss": 0.6522, "val_loss": 0.6486}