Priya committed
Commit ac28c1d
Parent: 243fc4f

Upload folder using huggingface_hub

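The commit message indicates the folder was pushed with the `huggingface_hub` client. A minimal sketch of how such an upload is typically done; the repo id below is a placeholder, and the local path is taken from `output_dir` in the uploaded config.yaml:

```python
from huggingface_hub import HfApi

api = HfApi()
# Hypothetical repo id; the folder path mirrors output_dir in config.yaml.
api.upload_folder(
    repo_id="your-username/tune-a-video-city",
    folder_path="/content/Tune-A-Video/outputs/city",
    commit_message="Upload folder using huggingface_hub",
)
```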
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ samples/sample-20/A[[:space:]]360[[:space:]]degree[[:space:]]view[[:space:]]video[[:space:]]of[[:space:]]a[[:space:]]street[[:space:]]with[[:space:]]Skies,[[:space:]]Walls,[[:space:]]Roads,[[:space:]]Poles,[[:space:]]Buildings,[[:space:]]Riders,[[:space:]]Sidewalks,[[:space:]]People,[[:space:]]Traffic[[:space:]]signs,[[:space:]]Bicycles[[:space:]]and[[:space:]]Cars.gif filter=lfs diff=lfs merge=lfs -text
+ samples/sample-20.gif filter=lfs diff=lfs merge=lfs -text
+ samples/sample-40/A[[:space:]]360[[:space:]]degree[[:space:]]view[[:space:]]video[[:space:]]of[[:space:]]a[[:space:]]street[[:space:]]with[[:space:]]Skies,[[:space:]]Walls,[[:space:]]Roads,[[:space:]]Poles,[[:space:]]Buildings,[[:space:]]Riders,[[:space:]]Sidewalks,[[:space:]]People,[[:space:]]Traffic[[:space:]]signs,[[:space:]]Bicycles[[:space:]]and[[:space:]]Cars.gif filter=lfs diff=lfs merge=lfs -text
+ samples/sample-40.gif filter=lfs diff=lfs merge=lfs -text
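The new sample GIF paths contain spaces, which are not legal in a `.gitattributes` pattern, so each space is written as the POSIX character class `[[:space:]]`. A small sketch of that escaping; the example path below is hypothetical and only mirrors the sample names in this commit:

```python
def to_gitattributes_pattern(path: str) -> str:
    # .gitattributes patterns cannot contain a literal space, so replace each
    # one with the POSIX character class [[:space:]], as in the entries above.
    return path.replace(" ", "[[:space:]]")

# Hypothetical path with spaces, for illustration only.
print(to_gitattributes_pattern("samples/sample-20/A 360 degree view.gif")
      + " filter=lfs diff=lfs merge=lfs -text")
```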
checkpoint-50/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:823e7c857aaec80325e9fdc5d488821c0346043b2f57df8b3445d0398be9d00b
+ size 3636579464
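Each large binary in this commit is stored as a Git LFS pointer: a three-line text stub carrying the spec version, the SHA-256 of the real file, and its size in bytes. A minimal sketch that parses such a pointer, using the model.safetensors stub shown above:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file (space-separated key/value pairs, one per line)."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),  # Python 3.9+
        "size": int(fields["size"]),
    }

pointer = """\
version https://git-lfs.github.com/spec/v1
oid sha256:823e7c857aaec80325e9fdc5d488821c0346043b2f57df8b3445d0398be9d00b
size 3636579464
"""
info = parse_lfs_pointer(pointer)
print(info["oid"], f"{info['size'] / 1e9:.2f} GB")  # ~3.64 GB of checkpoint weights
```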
checkpoint-50/optimizer.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb16121741adb95c7a5872fca47a9523dee35009fc4494408ca4576518884bcd
+ size 388426170
checkpoint-50/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:244ef55caaf247d394ead109bbb2f345221d9f612bbbb1c51673c08c864fa15c
+ size 14408
checkpoint-50/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bce7091777d309ad0996ba841e0cacd9295db515ae8e6e24f546718eefe0d105
+ size 988
checkpoint-50/scheduler.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6eff638987896b6247a36bc4cf7d83275d39274b18e306de495ba63836fa54b2
+ size 1000
config.yaml ADDED
@@ -0,0 +1,46 @@
+ pretrained_model_path: CompVis/stable-diffusion-v1-4
+ output_dir: /content/Tune-A-Video/outputs/city
+ train_data:
+   video_path: /content/Tune-A-Video/data/city.mp4
+   prompt: A 360 degree view video of a street with Skies, Walls, Roads, Poles, Buildings,
+     Riders, Sidewalks, People, Traffic signs, Bicycles and Cars
+   n_sample_frames: 24
+   width: 512
+   height: 512
+   sample_start_idx: 0
+   sample_frame_rate: 1
+ validation_data:
+   prompts:
+   - A 360 degree view video of a street with Skies, Walls, Roads, Poles, Buildings,
+     Riders, Sidewalks, People, Traffic signs, Bicycles and Cars
+   video_length: 24
+   width: 512
+   height: 512
+   num_inference_steps: 50
+   guidance_scale: 12.5
+   use_inv_latent: true
+   num_inv_steps: 50
+ validation_steps: 20
+ trainable_modules:
+ - attn1.to_q
+ - attn2.to_q
+ - attn_temp
+ train_batch_size: 1
+ max_train_steps: 50
+ learning_rate: 3.0e-05
+ scale_lr: false
+ lr_scheduler: constant
+ lr_warmup_steps: 0
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.01
+ adam_epsilon: 1.0e-08
+ max_grad_norm: 1.0
+ gradient_accumulation_steps: 1
+ gradient_checkpointing: true
+ checkpointing_steps: 50
+ resume_from_checkpoint: null
+ mixed_precision: fp16
+ use_8bit_adam: false
+ enable_xformers_memory_efficient_attention: true
+ seed: 33
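The training run is fully described by this config: 24 frames of the city clip at 512x512, 50 training steps at lr 3e-5 in fp16, with only the attention query projections and temporal attention tuned. A short sketch of reading it back (assumes PyYAML is installed); the printed values match the diff above:

```python
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["pretrained_model_path"])                  # CompVis/stable-diffusion-v1-4
print(cfg["train_data"]["n_sample_frames"],
      cfg["train_data"]["width"], cfg["train_data"]["height"])  # 24 512 512
print(cfg["trainable_modules"])                      # ['attn1.to_q', 'attn2.to_q', 'attn_temp']
print(cfg["max_train_steps"], cfg["learning_rate"])  # 50 3e-05
```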
inv_latents/ddim_latent-20.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1137750130d83adf8360f743538841ae38cb932fed8fefcd32355c33b02519ca
+ size 787647
inv_latents/ddim_latent-40.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1fff8b9c3a6af1cf9830bc8ee4b21a440d1c5a055a93e5ce223c079cab7a2f6b
+ size 787647
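The two inverted DDIM latents were written at validation steps 20 and 40 (`validation_steps: 20`, `use_inv_latent: true` in config.yaml). A sketch of inspecting one; the expected shape is an assumption derived from the config (24 frames, 512x512, the SD VAE's 8x downsampling, 4 latent channels) and is consistent with the ~788 kB pointer size in fp16:

```python
import torch

latent = torch.load("inv_latents/ddim_latent-20.pt", map_location="cpu")
# Expected (assumption): torch.Size([1, 4, 24, 64, 64]) in float16,
# i.e. 1*4*24*64*64*2 = 786,432 bytes of tensor data plus pickle overhead.
print(latent.shape, latent.dtype)
```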
model_index.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "_class_name": "TuneAVideoPipeline",
+   "_diffusers_version": "0.11.1",
+   "scheduler": [
+     "diffusers",
+     "PNDMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "models",
+     "UNet3DConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
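model_index.json wires the pipeline together: the PNDM scheduler, the CLIP text encoder and tokenizer, the fine-tuned UNet3DConditionModel, and the SD VAE. A sketch of running inference from this repo; the imports assume the Tune-A-Video repository layout (https://github.com/showlab/Tune-A-Video), and the device and output path are placeholders:

```python
import torch
# Imports assume a local checkout of the Tune-A-Video repository.
from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
from tuneavideo.models.unet import UNet3DConditionModel
from tuneavideo.util import save_videos_grid

model_path = "./"  # root of the folder uploaded in this commit
unet = UNet3DConditionModel.from_pretrained(model_path, subfolder="unet",
                                            torch_dtype=torch.float16)
pipe = TuneAVideoPipeline.from_pretrained(model_path, unet=unet,
                                          torch_dtype=torch.float16).to("cuda")

prompt = ("A 360 degree view video of a street with Skies, Walls, Roads, Poles, "
          "Buildings, Riders, Sidewalks, People, Traffic signs, Bicycles and Cars")
# Reuse one of the uploaded DDIM-inverted latents for structure-preserving sampling.
latents = torch.load("inv_latents/ddim_latent-40.pt").to("cuda", torch.float16)

video = pipe(prompt, latents=latents, video_length=24, height=512, width=512,
             num_inference_steps=50, guidance_scale=12.5).videos
save_videos_grid(video, "./sample.gif")
```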
samples/sample-20.gif ADDED

Git LFS Details

  • SHA256: ce7d4432d666f18fed75da956aec40bcae2463c390163428612d39302426cbc5
  • Pointer size: 132 Bytes
  • Size of remote file: 3.39 MB
samples/sample-20/A 360 degree view video of a street with Skies, Walls, Roads, Poles, Buildings, Riders, Sidewalks, People, Traffic signs, Bicycles and Cars.gif ADDED

Git LFS Details

  • SHA256: c5f650bfea9560a1ce2c0785746bbf703e424b04af0986bfbac6464f8c1c8a3a
  • Pointer size: 132 Bytes
  • Size of remote file: 3.39 MB
samples/sample-40.gif ADDED

Git LFS Details

  • SHA256: ded9813915ec7f5132131a57bdd2be9ad76d8efbb89fbc483951d061d4c62bb2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.98 MB
samples/sample-40/A 360 degree view video of a street with Skies, Walls, Roads, Poles, Buildings, Riders, Sidewalks, People, Traffic signs, Bicycles and Cars.gif ADDED

Git LFS Details

  • SHA256: 752a1ec9af69f4617f94db95c78b8fbf044e7cdebe92cb651ce3607cf80be143
  • Pointer size: 132 Bytes
  • Size of remote file: 2.98 MB
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "_class_name": "PNDMScheduler",
+   "_diffusers_version": "0.11.1",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "num_train_timesteps": 1000,
+   "prediction_type": "epsilon",
+   "set_alpha_to_one": false,
+   "skip_prk_steps": true,
+   "steps_offset": 1,
+   "trained_betas": null
+ }
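A short sketch of recreating the scheduler from this subfolder (assuming a reasonably recent diffusers install) and preparing it for the 50 inference steps used for validation in config.yaml:

```python
from diffusers import PNDMScheduler

scheduler = PNDMScheduler.from_pretrained("./", subfolder="scheduler")
scheduler.set_timesteps(50)  # num_inference_steps from config.yaml
```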
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "CompVis/stable-diffusion-v1-4",
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dropout": 0.0,
+   "eos_token_id": 2,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 768,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 77,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "projection_dim": 512,
+   "torch_dtype": "float16",
+   "transformers_version": "4.26.0",
+   "vocab_size": 49408
+ }
text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87c1b379e54b3c87654e7661109bd1315019c3b357cf6237544002abf76d1258
+ size 246189278
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<|endoftext|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "do_lower_case": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 77,
+   "name_or_path": "/root/.cache/huggingface/diffusers/models--CompVis--stable-diffusion-v1-4/snapshots/133a221b8aa7292a167afc5127cb63fb5005638b/tokenizer",
+   "pad_token": "<|endoftext|>",
+   "special_tokens_map_file": "./special_tokens_map.json",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
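The text encoder and tokenizer are unchanged copies of the CompVis/stable-diffusion-v1-4 components. A sketch of loading them from this repo's subfolders and embedding a prompt; the output shape follows from `max_position_embeddings: 77` and `hidden_size: 768` in the configs above:

```python
from transformers import CLIPTextModel, CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("./", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained("./", subfolder="text_encoder")

ids = tokenizer("A 360 degree view video of a street", padding="max_length",
                max_length=tokenizer.model_max_length, return_tensors="pt").input_ids
embeddings = text_encoder(ids)[0]  # last hidden state fed to the UNet's cross-attention
print(embeddings.shape)  # torch.Size([1, 77, 768])
```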
 
unet/config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "_class_name": "UNet3DConditionModel",
+   "_diffusers_version": "0.11.1",
+   "act_fn": "silu",
+   "attention_head_dim": 8,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "class_embed_type": null,
+   "cross_attention_dim": 768,
+   "down_block_types": [
+     "CrossAttnDownBlock3D",
+     "CrossAttnDownBlock3D",
+     "CrossAttnDownBlock3D",
+     "DownBlock3D"
+   ],
+   "downsample_padding": 1,
+   "dual_cross_attention": false,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "mid_block_type": "UNetMidBlock3DCrossAttn",
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
+   "out_channels": 4,
+   "resnet_time_scale_shift": "default",
+   "sample_size": 64,
+   "up_block_types": [
+     "UpBlock3D",
+     "CrossAttnUpBlock3D",
+     "CrossAttnUpBlock3D",
+     "CrossAttnUpBlock3D"
+   ],
+   "upcast_attention": false,
+   "use_linear_projection": false
+ }
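The UNet keeps the SD v1-4 channel layout but with 3D blocks and temporal attention, and only the modules listed under `trainable_modules` in config.yaml were updated during the 50 training steps. A sketch of how that selective unfreezing is typically applied (the import again assumes the Tune-A-Video repo layout):

```python
from tuneavideo.models.unet import UNet3DConditionModel  # assumed repo layout

unet = UNet3DConditionModel.from_pretrained("./", subfolder="unet")
unet.requires_grad_(False)  # freeze everything first
for name, module in unet.named_modules():
    # trainable_modules from config.yaml: attn1.to_q, attn2.to_q, attn_temp
    if name.endswith(("attn1.to_q", "attn2.to_q", "attn_temp")):
        for param in module.parameters():
            param.requires_grad = True

trainable = sum(p.numel() for p in unet.parameters() if p.requires_grad)
print(f"trainable parameters: {trainable / 1e6:.1f}M")
```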
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf5dfef082dcf563f76b1bac7d9949fbeb6f723d4bf31e7fe08cf50f1c3f663f
+ size 3636825494
vae/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.11.1",
+   "_name_or_path": "CompVis/stable-diffusion-v1-4",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 512,
+   "scaling_factor": 0.18215,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ]
+ }
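The VAE is the stock SD v1-4 autoencoder: four DownEncoderBlock2D stages map 512x512 RGB frames to 4-channel 64x64 latents, which is where the latent sizes above come from. A small sketch (the random tensor is a stand-in for a real video frame):

```python
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("./", subfolder="vae")

frame = torch.randn(1, 3, 512, 512)  # stand-in for one 512x512 video frame
with torch.no_grad():
    latent = vae.encode(frame).latent_dist.sample() * 0.18215  # scaling_factor from the config above
print(latent.shape)  # torch.Size([1, 4, 64, 64])
```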
vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95bf483e6eadb4a0d0b0968b36156445ce4dfcf5961910b0a9a5a2e10c8a715b
+ size 167408066