Upload folder using huggingface_hub
- .gitattributes +2 -0
- checkpoints/clip-vit-base-patch32/.gitattributes +16 -0
- checkpoints/clip-vit-base-patch32/README.md +145 -0
- checkpoints/clip-vit-base-patch32/config.json +157 -0
- checkpoints/clip-vit-base-patch32/flax_model.msgpack +3 -0
- checkpoints/clip-vit-base-patch32/merges.txt +0 -0
- checkpoints/clip-vit-base-patch32/preprocessor_config.json +19 -0
- checkpoints/clip-vit-base-patch32/pytorch_model.bin +3 -0
- checkpoints/clip-vit-base-patch32/special_tokens_map.json +1 -0
- checkpoints/clip-vit-base-patch32/tf_model.h5 +3 -0
- checkpoints/clip-vit-base-patch32/tokenizer.json +0 -0
- checkpoints/clip-vit-base-patch32/tokenizer_config.json +1 -0
- checkpoints/clip-vit-base-patch32/vocab.json +0 -0
- checkpoints/output/svd/train_2026-02-07T13-19-50/config.yaml +101 -0
- checkpoints/output/svd/train_2026-02-07T13-33-11/config.yaml +101 -0
- checkpoints/stable-video-diffusion-img2vid/.gitattributes +36 -0
- checkpoints/stable-video-diffusion-img2vid/LICENSE.md +58 -0
- checkpoints/stable-video-diffusion-img2vid/README.md +88 -0
- checkpoints/stable-video-diffusion-img2vid/comparison.png +3 -0
- checkpoints/stable-video-diffusion-img2vid/feature_extractor/preprocessor_config.json +28 -0
- checkpoints/stable-video-diffusion-img2vid/image_encoder/config.json +23 -0
- checkpoints/stable-video-diffusion-img2vid/image_encoder/model.fp16.safetensors +3 -0
- checkpoints/stable-video-diffusion-img2vid/image_encoder/model.safetensors +3 -0
- checkpoints/stable-video-diffusion-img2vid/model_index.json +25 -0
- checkpoints/stable-video-diffusion-img2vid/output_tile.gif +3 -0
- checkpoints/stable-video-diffusion-img2vid/scheduler/scheduler_config.json +20 -0
- checkpoints/stable-video-diffusion-img2vid/svd.safetensors +3 -0
- checkpoints/stable-video-diffusion-img2vid/svd_image_decoder.safetensors +3 -0
- checkpoints/stable-video-diffusion-img2vid/unet/config.json +38 -0
- checkpoints/stable-video-diffusion-img2vid/unet/diffusion_pytorch_model.fp16.safetensors +3 -0
- checkpoints/stable-video-diffusion-img2vid/unet/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/stable-video-diffusion-img2vid/vae/config.json +24 -0
- checkpoints/stable-video-diffusion-img2vid/vae/diffusion_pytorch_model.fp16.safetensors +3 -0
- checkpoints/stable-video-diffusion-img2vid/vae/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/svd-robot/.gitattributes +35 -0
- checkpoints/svd-robot/feature_extractor/preprocessor_config.json +27 -0
- checkpoints/svd-robot/image_encoder/config.json +23 -0
- checkpoints/svd-robot/image_encoder/model.safetensors +3 -0
- checkpoints/svd-robot/model_index.json +25 -0
- checkpoints/svd-robot/scheduler/scheduler_config.json +24 -0
- checkpoints/svd-robot/unet/config.json +38 -0
- checkpoints/svd-robot/unet/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/svd-robot/vae/config.json +24 -0
- checkpoints/svd-robot/vae/diffusion_pytorch_model.safetensors +3 -0
.gitattributes CHANGED
@@ -126,3 +126,5 @@ aloha_robot_project/calvin/calvin_env/tacto/website/static/img/demo_rolling.gif
 aloha_robot_project/calvin/calvin_env/tacto/website/static/img/demo_shadow.gif filter=lfs diff=lfs merge=lfs -text
 aloha_robot_project/calvin/media/sensors.png filter=lfs diff=lfs merge=lfs -text
 aloha_robot_project/calvin/media/teaser.png filter=lfs diff=lfs merge=lfs -text
+checkpoints/stable-video-diffusion-img2vid/comparison.png filter=lfs diff=lfs merge=lfs -text
+checkpoints/stable-video-diffusion-img2vid/output_tile.gif filter=lfs diff=lfs merge=lfs -text
checkpoints/clip-vit-base-patch32/.gitattributes ADDED
@@ -0,0 +1,16 @@
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
checkpoints/clip-vit-base-patch32/README.md ADDED
@@ -0,0 +1,145 @@
+---
+tags:
+- vision
+widget:
+- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
+  candidate_labels: playing music, playing sports
+  example_title: Cat & Dog
+---
+
+# Model Card: CLIP
+
+Disclaimer: The model card is taken and modified from the official CLIP repository; it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md).
+
+## Model Details
+
+The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they're being deployed within.
+
+### Model Date
+
+January 2021
+
+### Model Type
+
+The model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss.
+
+The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer.
+
+
+### Documents
+
+- [Blog Post](https://openai.com/blog/clip/)
+- [CLIP Paper](https://arxiv.org/abs/2103.00020)
+
+
+### Use with Transformers
+
+```python3
+from PIL import Image
+import requests
+
+from transformers import CLIPProcessor, CLIPModel
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+
+outputs = model(**inputs)
+logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
+```
+
+
+## Model Use
+
+### Intended Use
+
+The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis.
+
+#### Primary intended uses
+
+The primary intended users of these models are AI researchers.
+
+We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models.
+
+### Out-of-Scope Use Cases
+
+**Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task-specific testing, especially given the variability of CLIP's performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful.
+
+Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use.
+
+Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases.
+
+
+
+## Data
+
+The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet, which tend to skew towards more developed nations and younger, male users.
+
+### Data Mission Statement
+
+Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset.
+
+
+
+## Performance and Limitations
+
+### Performance
+
+We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets, ranging from OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets:
+
+- Food101
+- CIFAR10
+- CIFAR100
+- Birdsnap
+- SUN397
+- Stanford Cars
+- FGVC Aircraft
+- VOC2007
+- DTD
+- Oxford-IIIT Pet dataset
+- Caltech101
+- Flowers102
+- MNIST
+- SVHN
+- IIIT5K
+- Hateful Memes
+- SST-2
+- UCF101
+- Kinetics700
+- Country211
+- CLEVR Counting
+- KITTI Distance
+- STL-10
+- RareAct
+- Flickr30
+- MSCOCO
+- ImageNet
+- ImageNet-A
+- ImageNet-R
+- ImageNet Sketch
+- ObjectNet (ImageNet Overlap)
+- Youtube-BB
+- ImageNet-Vid
+
+## Limitations
+
+CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine-grained classification and counting objects. CLIP also poses issues with regards to fairness and bias, which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation: in many cases we have used linear probes to evaluate the performance of CLIP, and there is evidence suggesting that linear probes can underestimate model performance.
+
+### Bias and Fairness
+
+We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details are captured in the Broader Impacts section of the paper.)
+
+We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (we default to using race categories as they are constructed in the Fairface dataset) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification, with 'Middle Eastern' having the highest accuracy (98.4%) and 'White' having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks, not to demonstrate an endorsement/enthusiasm for such tasks.
+
+
+
+## Feedback
+
+### Where to send questions or comments about the model
+
+Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9).
checkpoints/clip-vit-base-patch32/config.json ADDED
@@ -0,0 +1,157 @@
+{
+  "_name_or_path": "openai/clip-vit-base-patch32",
+  "architectures": [
+    "CLIPModel"
+  ],
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "clip",
+  "projection_dim": 512,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 512,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 2048,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 8,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "projection_dim": 512,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.0.dev0",
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "text_config_dict": null,
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 32,
+    "prefix": null,
+    "projection_dim": 512,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.0.dev0",
+    "use_bfloat16": false
+  },
+  "vision_config_dict": null
+}
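The config records the ViT-B/32 geometry: a 12-layer, 512-wide text tower and a 12-layer, 768-wide vision tower, both projected into a shared 512-dimensional embedding space. A minimal sanity-check sketch with `transformers`, assuming the checkpoint directory is available locally under the relative path used in this commit:

```python
from transformers import CLIPConfig

# Load the config.json shown above from the local checkpoint directory
# (relative path assumed from this commit's layout).
config = CLIPConfig.from_pretrained("checkpoints/clip-vit-base-patch32")

print(config.projection_dim)             # 512 (shared embedding space)
print(config.text_config.hidden_size)    # 512
print(config.vision_config.hidden_size)  # 768
print(config.vision_config.patch_size)   # 32, the "patch32" in the model name
```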
checkpoints/clip-vit-base-patch32/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a2994d89bebd77abba5a554789dd9152a7e25467b79d88f6bd237d2dec5051c
+size 605123003
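This three-line file is a Git LFS pointer, not the weights themselves: the repository tracks only the object's SHA-256 and byte size, and `git lfs` fetches the ~605 MB payload on checkout. A minimal sketch of reading such a pointer before LFS has smudged it into the real file (`parse_lfs_pointer` is a hypothetical helper written for illustration):

```python
def parse_lfs_pointer(path: str) -> dict:
    """Parse a git-lfs pointer file into its whitespace-separated key/value fields."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("checkpoints/clip-vit-base-patch32/flax_model.msgpack")
print(ptr["oid"])   # sha256:2a2994d8...
print(ptr["size"])  # 605123003 bytes (~605 MB)
```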
checkpoints/clip-vit-base-patch32/merges.txt ADDED
(The diff for this file is too large to render. See raw diff.)
checkpoints/clip-vit-base-patch32/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "crop_size": 224,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "size": 224
+}
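This config drives CLIP's image preprocessing: resize to 224, center-crop, and normalize with the CLIP channel statistics. A minimal loading sketch, assuming the same local checkpoint path:

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Reads the preprocessor_config.json shown above.
processor = CLIPImageProcessor.from_pretrained("checkpoints/clip-vit-base-patch32")

image = Image.new("RGB", (640, 480), "gray")  # placeholder image for illustration
pixel_values = processor(images=image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```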
checkpoints/clip-vit-base-patch32/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a63082132ba4f97a80bea76823f544493bffa8082296d62d71581a4feff1576f
+size 605247071
checkpoints/clip-vit-base-patch32/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{"bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}
checkpoints/clip-vit-base-patch32/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d7e64ea2c496306a4bc0a6a7ab21e4755fd8c7beb01e6d9f5dc8d6662f1bfd2
+size 605551040
checkpoints/clip-vit-base-patch32/tokenizer.json ADDED
(The diff for this file is too large to render. See raw diff.)
checkpoints/clip-vit-base-patch32/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+{"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": "<|endoftext|>", "add_prefix_space": false, "errors": "replace", "do_lower_case": true, "name_or_path": "./clip_ViT_B_32/", "model_max_length": 77}
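Together with `merges.txt` and `vocab.json`, these two files define CLIP's byte-pair-encoding tokenizer: `<|startoftext|>`/`<|endoftext|>` as BOS/EOS and a `model_max_length` of 77. A minimal loading sketch, under the same local-path assumption:

```python
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("checkpoints/clip-vit-base-patch32")

enc = tokenizer("a photo of a robot arm", padding="max_length", return_tensors="pt")
print(enc.input_ids.shape)  # torch.Size([1, 77]); model_max_length from tokenizer_config.json
print(tokenizer.bos_token, tokenizer.eos_token)  # <|startoftext|> <|endoftext|>
```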
checkpoints/clip-vit-base-patch32/vocab.json ADDED
(The diff for this file is too large to render. See raw diff.)
checkpoints/output/svd/train_2026-02-07T13-19-50/config.yaml ADDED
@@ -0,0 +1,101 @@
+pretrained_model_path: /workspace/checkpoints/stable-video-diffusion-img2vid
+output_dir: /workspace/checkpoints/output/svd
+train_args:
+  debug: false
+  use_lora: false
+  project_name: svd_aloha
+  run_name: clean_code
+  clip_model_path: /workspace/checkpoints/clip-vit-base-patch32
+  clip_token_length: 20
+  position_encode: true
+  clip_img_size: 224
+  use_img_cond: false
+  annotation_name: annotation
+  dataset_dir: /workspace/data/robotdata
+  dataset: aloha_0050
+  prob:
+  - 1.0
+  tie_weight: true
+  normalize: true
+  pre_encode: true
+  num_workers: 16
+  video_size:
+  - 256
+  - 256
+  sequence_length: 16
+  only_one_clip: true
+  log_every: 100
+  global_seed: 12345
+  do_evaluate: false
+  evaluate_checkpoint: false
+  resume_from_checkpoint: false
+  validation_num: 32
+  video_num: 3
+  num_frames: 16
+  width: 256
+  height: 256
+  sample_preview: true
+  num_inference_steps: 30
+  guidance_scale: 7.5
+  fps: 10
+  motion_bucket_id: 127
+  decode_chunk_size: 10
+  shuffle: true
+  validation_steps: 2000
+  trainable_modules:
+  - all
+  - attn1
+  - attn2
+  - conv_in
+  - temp_conv
+  extra_unet_params: null
+  extra_text_encoder_params: null
+  train_batch_size: 6
+  max_train_steps: 10000
+  learning_rate: 5.0e-06
+  scale_lr: false
+  lr_scheduler: constant
+  lr_warmup_steps: 0
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_weight_decay: 0
+  adam_epsilon: 1.0e-08
+  max_grad_norm: 1.0
+  gradient_accumulation_steps: 1
+  gradient_checkpointing: true
+  text_encoder_gradient_checkpointing: false
+  checkpointing_steps: 2000
+  resume_from_checkpoint: null
+  resume_step: null
+  mixed_precision: fp16
+  use_8bit_adam: false
+  enable_xformers_memory_efficient_attention: false
+  enable_torch_2_attn: true
+  seed: 6
+  use_offset_noise: false
+  rescale_schedule: false
+  offset_noise_strength: 0.1
+  extend_dataset: false
+  cache_latents: false
+  cached_latent_dir: null
+  save_pretrained_model: true
+  logger_type: tensorboard
+  kwargs:
+    motion_mask: false
+    motion_strength: false
+  dataset_types:
+  - video_blip
+  train_text_encoder: false
+  lora_version: cloneofsimo
+  use_unet_lora: false
+  use_text_lora: false
+  lora_unet_dropout: 0.1
+  lora_text_dropout: 0.1
+  save_lora_for_webui: true
+  only_lora_for_webui: false
+  unet_lora_modules:
+  - UNet3DConditionModel
+  text_encoder_lora_modules:
+  - CLIPEncoderLayer
+  lora_rank: 16
+  trainable_text_modules: null
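Indentation in the YAML above is reconstructed (the diff rendering flattens it), so treat the nesting as a best guess. A minimal sketch of loading and inspecting the config with PyYAML, using the paths recorded in the file:

```python
import yaml

with open("checkpoints/output/svd/train_2026-02-07T13-19-50/config.yaml") as f:
    cfg = yaml.safe_load(f)

args = cfg["train_args"]  # nesting assumed as reconstructed above
print(args["learning_rate"], args["train_batch_size"], args["max_train_steps"])
# 5e-06 6 10000
print(args["video_size"], args["num_frames"], args["motion_bucket_id"])
# [256, 256] 16 127
```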
checkpoints/output/svd/train_2026-02-07T13-33-11/config.yaml ADDED
@@ -0,0 +1,101 @@
(All 101 lines are identical to checkpoints/output/svd/train_2026-02-07T13-19-50/config.yaml above; the two training runs differ only in their timestamped output directory.)
checkpoints/stable-video-diffusion-img2vid/.gitattributes ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+output_tile.gif filter=lfs diff=lfs merge=lfs -text
checkpoints/stable-video-diffusion-img2vid/LICENSE.md ADDED
@@ -0,0 +1,58 @@
+STABILITY AI COMMUNITY LICENSE AGREEMENT
+
+Last Updated: July 5, 2024
+
+1. INTRODUCTION
+
+This Agreement applies to any individual person or entity ("You", "Your" or "Licensee") that uses or distributes any portion or element of the Stability AI Materials or Derivative Works thereof for any Research & Non-Commercial or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below.
+
+This Agreement is intended to allow research, non-commercial, and limited commercial uses of the Models free of charge. In order to ensure that certain limited commercial uses of the Models continue to be allowed, this Agreement preserves free access to the Models for people or organizations generating annual revenue of less than US $1,000,000 (or local currency equivalent).
+
+By clicking "I Accept" or by using or distributing or using any portion or element of the Stability Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then "You" includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity's behalf.
+
+2. RESEARCH & NON-COMMERCIAL USE LICENSE
+
+Subject to the terms of this Agreement, Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI's intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Research or Non-Commercial Purpose. "Research Purpose" means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others. "Non-Commercial Purpose" means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.
+
+3. COMMERCIAL USE LICENSE
+
+Subject to the terms of this Agreement (including the remainder of this Section III), Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI's intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Commercial Purpose. "Commercial Purpose" means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for commercial advantage or monetary compensation to You or others, including but not limited to, (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, and (ii) for Your business's or organization's internal operations.
+If You are using or distributing the Stability AI Materials for a Commercial Purpose, You must register with Stability AI at (https://stability.ai/community-license). If at any time You or Your Affiliate(s), either individually or in aggregate, generate more than USD $1,000,000 in annual revenue (or the equivalent thereof in Your local currency), regardless of whether that revenue is generated directly or indirectly from the Stability AI Materials or Derivative Works, any licenses granted to You under this Agreement shall terminate as of such date. You must request a license from Stability AI at (https://stability.ai/enterprise), which Stability AI may grant to You in its sole discretion. If you receive Stability AI Materials, or any Derivative Works thereof, from a Licensee as part of an integrated end user product, then Section III of this Agreement will not apply to you.
+
+4. GENERAL TERMS
+
+Your Research, Non-Commercial, and Commercial License(s) under this Agreement are subject to the following terms.
+a. Distribution & Attribution. If You distribute or make available the Stability AI Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved", and (iii) prominently display "Powered by Stability AI" on a related website, user interface, blogpost, about page, or product documentation. If You create a Derivative Work, You may add your own attribution notice(s) to the "Notice" text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Stability AI Materials and state in the "Notice" text file that You changed the Stability AI Materials and how it was modified.
+b. Use Restrictions. Your use of the Stability AI Materials and Derivative Works, including any output or results of the Stability AI Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to the Documentation and Stability AI's AUP, which is hereby incorporated by reference. Furthermore, You will not use the Stability AI Materials or Derivative Works, or any output or results of the Stability AI Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works).
+c. Intellectual Property.
+(i) Trademark License. No trademark licenses are granted under this Agreement, and in connection with the Stability AI Materials or Derivative Works, You may not use any name or mark owned by or associated with Stability AI or any of its Affiliates, except as required under Section IV(a) herein.
+(ii) Ownership of Derivative Works. As between You and Stability AI, You are the owner of Derivative Works You create, subject to Stability AI's ownership of the Stability AI Materials and any Derivative Works made by or for Stability AI.
+(iii) Ownership of Outputs. As between You and Stability AI, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law.
+(iv) Disputes. If You or Your Affiliate(s) institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Stability AI Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to Your use or distribution of the Stability AI Materials or Derivative Works in violation of this Agreement.
+(v) Feedback. From time to time, You may provide Stability AI with verbal and/or written suggestions, comments or other feedback related to Stability AI's existing or prospective technology, products or services (collectively, "Feedback"). You are not obligated to provide Stability AI with Feedback, but to the extent that You do, You hereby grant Stability AI a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided "AS IS" and You make no warranties whatsoever about any Feedback.
+d. Disclaimer Of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE STABILITY AI MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE STABILITY AI MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE STABILITY AI MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.
+e. Limitation Of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+f. Term And Termination. The term of this Agreement will commence upon Your acceptance of this Agreement or access to the Stability AI Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Stability AI Materials or Derivative Works. Section IV(d), (e), and (g) shall survive the termination of this Agreement.
+g. Governing Law. This Agreement will be governed by and constructed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement.
+
+5. DEFINITIONS
+
+"Affiliate(s)" means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, "control" means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.
+
+"Agreement" means this Stability AI Community License Agreement.
+
+"AUP" means the Stability AI Acceptable Use Policy available at (https://stability.ai/use-policy), as may be updated from time to time.
+
+"Derivative Work(s)" means (a) any derivative work of the Stability AI Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model's output, including "fine tune" and "low-rank adaptation" models derived from a Model or a Model's output, but do not include the output of any Model.
+
+"Documentation" means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software or Models.
+
+"Model(s)" means, collectively, Stability AI's proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing listed on Stability's Core Models Webpage available at (https://stability.ai/core-models), as may be updated from time to time.
+
+"Stability AI" or "we" means Stability AI Ltd. and its Affiliates.
+
+"Software" means Stability AI's proprietary software made available under this Agreement now or in the future.
+
+"Stability AI Materials" means, collectively, Stability's proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement.
+
+"Trade Control Laws" means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
checkpoints/stable-video-diffusion-img2vid/README.md
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
pipeline_tag: image-to-video
|
| 3 |
+
license: other
|
| 4 |
+
license_name: stable-video-diffusion-community
|
| 5 |
+
license_link: LICENSE.md
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Stable Video Diffusion Image-to-Video Model Card
|
| 9 |
+
|
| 10 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 11 |
+

|
| 12 |
+
Stable Video Diffusion (SVD) Image-to-Video is a diffusion model that takes in a still image as a conditioning frame, and generates a video from it.
|
| 13 |
+
|
| 14 |
+
Please note: For commercial use of this model, please refer to https://stability.ai/license.
|
| 15 |
+
|
| 16 |
+
## Model Details
|
| 17 |
+
|
| 18 |
+
### Model Description
|
| 19 |
+
|
| 20 |
+
(SVD) Image-to-Video is a latent diffusion model trained to generate short video clips from an image conditioning.
|
| 21 |
+
This model was trained to generate 14 frames at resolution 576x1024 given a context frame of the same size.
|
| 22 |
+
We also finetune the widely used [f8-decoder](https://huggingface.co/docs/diffusers/api/models/autoencoderkl#loading-from-the-original-format) for temporal consistency.
|
| 23 |
+
For convenience, we additionally provide the model with the
|
| 24 |
+
standard frame-wise decoder [here](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid/blob/main/svd_image_decoder.safetensors).
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** Stability AI
|
| 28 |
+
- **Funded by:** Stability AI
|
| 29 |
+
- **Model type:** Generative image-to-video model
|
| 30 |
+
|
| 31 |
+
### Model Sources
|
| 32 |
+
|
| 33 |
+
For research purposes, we recommend our `generative-models` Github repository (https://github.com/Stability-AI/generative-models),
|
| 34 |
+
which implements the most popular diffusion frameworks (both training and inference).
|
| 35 |
+
|
| 36 |
+
- **Repository:** https://github.com/Stability-AI/generative-models
|
| 37 |
+
- **Paper:** https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets
|
| 38 |
+
|
| 39 |
+
## Evaluation
|
| 40 |
+

|
| 41 |
+
The chart above evaluates user preference for SVD-Image-to-Video over [GEN-2](https://research.runwayml.com/gen2) and [PikaLabs](https://www.pika.art/).
|
| 42 |
+
SVD-Image-to-Video is preferred by human voters in terms of video quality. For details on the user study, we refer to the [research paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets)
|
| 43 |
+
|
| 44 |
+
## Uses
|
| 45 |
+
|
| 46 |
+
### Direct Use
|
| 47 |
+
|
| 48 |
+
The model is intended for research purposes only. Possible research areas and tasks include
|
| 49 |
+
|
| 50 |
+
- Research on generative models.
|
| 51 |
+
- Safe deployment of models which have the potential to generate harmful content.
|
| 52 |
+
- Probing and understanding the limitations and biases of generative models.
|
| 53 |
+
- Generation of artworks and use in design and other artistic processes.
|
| 54 |
+
- Applications in educational or creative tools.
|
| 55 |
+
|
| 56 |
+
Excluded uses are described below.
|
| 57 |
+
|
| 58 |
+
### Out-of-Scope Use
|
| 59 |
+
|
| 60 |
+
The model was not trained to be factual or true representations of people or events,
|
| 61 |
+
and therefore using the model to generate such content is out-of-scope for the abilities of this model.
|
| 62 |
+
The model should not be used in any way that violates Stability AI's [Acceptable Use Policy](https://stability.ai/use-policy).
|
| 63 |
+
|
| 64 |
+
## Limitations and Bias
|
| 65 |
+
|
| 66 |
+
### Limitations
|
| 67 |
+
- The generated videos are rather short (<= 4sec), and the model does not achieve perfect photorealism.
|
| 68 |
+
- The model may generate videos without motion, or very slow camera pans.
|
| 69 |
+
- The model cannot be controlled through text.
|
| 70 |
+
- The model cannot render legible text.
|
| 71 |
+
- Faces and people in general may not be generated properly.
|
| 72 |
+
- The autoencoding part of the model is lossy.
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
### Recommendations
|
| 76 |
+
|
| 77 |
+
The model is intended for research purposes only.
|
| 78 |
+
|
| 79 |
+
## How to Get Started with the Model
|
| 80 |
+
|
| 81 |
+
Check out https://github.com/Stability-AI/generative-models
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Appendix:
|
| 86 |
+
All considered potential data sources were included for final training, with none held out as the proposed data filtering methods described in the SVD paper handle the quality control/filtering of the dataset. With regards to safety/NSFW filtering, sources considered were either deemed safe or filtered with the in-house NSFW filters. No explicit human labor is involved in training data preparation. However, human evaluation for model outputs and quality was extensively used to evaluate model quality and performance. The evaluations were performed with third-party contractor platforms (Amazon Sagemaker, Amazon Mechanical Turk, Prolific) with fluent English-speaking contractors from various countries, primarily from the USA, UK, and Canada. Each worker was paid $12/hr for the time invested in the evaluation. No other third party was involved in the development of this model; the model was fully developed in-house at Stability AI. Training the SVD checkpoints required a total of approximately 200,000 A100 80GB hours. The majority of the training occurred on 48 * 8 A100s, while some stages took more/less than that. The resulting CO2 emission is ~19,000kg CO2 eq., and energy consumed is ~64000 kWh. The released checkpoints (SVD/SVD-XT) are image-to-video models that generate short videos/animations closely following the given input image. Since the model relies on an existing supplied image, the potential risks of disclosing specific material or novel unsafe content are minimal. This was also evaluated by third-party independent red-teaming services, which agree with our conclusion to a high degree of confidence (>90% in various areas of safety red-teaming). The external evaluations were also performed for trustworthiness, leading to >95% confidence in real, trustworthy videos. With the default settings at the time of release, SVD takes ~100s for generation, and SVD-XT takes ~180s on an A100 80GB card. Several optimizations to trade off quality / memory / speed can be done to perform faster inference or inference on lower VRAM cards. The information related to the model and its development process and usage protocols can be found in the GitHub repo, associated research paper, and HuggingFace model page/cards. The released model inference & demo code has image-level watermarking enabled by default, which can be used to detect the outputs. This is done via the imWatermark Python library.
|
| 87 |
+
The model can be used to generate videos from static initial images. However, we prohibit unlawful, obscene, or misleading uses of the model consistent with the terms of our license and Acceptable Use Policy. For the open-weights release, our training data filtering mitigations alleviate this risk to some extent. These restrictions are explicitly enforced on user-facing interfaces at stablevideo.com, where a warning is issued. We do not take any responsibility for third-party interfaces. Submitting initial images that bypass input filters to tease out offensive or inappropriate content listed above is also prohibited. Safety filtering checks at stablevideo.com run on model inputs and outputs independently. More details on our user-facing interfaces can be found here: https://www.stablevideo.com/faq. Beyond the Acceptable Use Policy and other mitigations and conditions described here, the model is not subject to additional model behavior interventions of the type described in the Foundation Model Transparency Index.
|
| 88 |
+
For stablevideo.com, we store preference data in the form of upvotes/downvotes on user-generated videos, and we have a pairwise ranker that runs while a user generates videos. This usage data is solely used for improving Stability AI’s future image/video models and services. No other third-party entities are given access to the usage data beyond Stability AI and maintainers of stablevideo.com. For usage statistics of SVD, we refer interested users to HuggingFace model download/usage statistics as a primary indicator. Third-party applications also have reported model usage statistics. We might also consider releasing aggregate usage statistics of stablevideo.com on reaching some milestones.
checkpoints/stable-video-diffusion-img2vid/comparison.png
ADDED
Git LFS Details
checkpoints/stable-video-diffusion-img2vid/feature_extractor/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
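This is the standard CLIP preprocessing recipe: resize the shortest edge to 224, center-crop to 224×224, rescale by 1/255, and normalize with the CLIP mean/std above. A minimal sketch of applying it with transformers, assuming a local input frame named frame.png:

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Load the preprocessing config shipped with this checkpoint folder.
processor = CLIPImageProcessor.from_pretrained(
    "checkpoints/stable-video-diffusion-img2vid/feature_extractor"
)

image = Image.open("frame.png").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```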
checkpoints/stable-video-diffusion-img2vid/image_encoder/config.json
ADDED
@@ -0,0 +1,23 @@
{
  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--diffusers--svd-test/snapshots/b9d5dcd269e2f7bff9f98a4907b8c69b7acd555d/image_encoder",
  "architectures": [
    "CLIPVisionModelWithProjection"
  ],
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "gelu",
  "hidden_size": 1280,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 5120,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 32,
  "patch_size": 14,
  "projection_dim": 1024,
  "torch_dtype": "float16",
  "transformers_version": "4.34.0.dev0"
}
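Per this config, the encoder is a ViT-H/14 CLIP vision tower (32 layers, hidden size 1280) projecting to 1024-dim image embeddings, matching the UNet's cross_attention_dim. A minimal sketch of running it on already-preprocessed pixels (random input here, just to show shapes):

```python
import torch
from transformers import CLIPVisionModelWithProjection

encoder = CLIPVisionModelWithProjection.from_pretrained(
    "checkpoints/stable-video-diffusion-img2vid/image_encoder"
)
pixel_values = torch.randn(1, 3, 224, 224)  # what the feature_extractor emits
with torch.no_grad():
    out = encoder(pixel_values=pixel_values)
print(out.image_embeds.shape)  # torch.Size([1, 1024]) == projection_dim
```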
checkpoints/stable-video-diffusion-img2vid/image_encoder/model.fp16.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae616c24393dd1854372b0639e5541666f7521cbe219669255e865cb7f89466a
size 1264217240
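The .safetensors and media entries in this upload are Git LFS pointer files: three key-value lines (spec version, sha256 object id, byte size) stand in for the real payload until `git lfs pull` materializes it. A small sketch of reading those fields, using the pointer above:

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = parse_lfs_pointer(
    "checkpoints/stable-video-diffusion-img2vid/image_encoder/model.fp16.safetensors"
)
print(ptr["oid"])                           # sha256:ae616c24...
print(f'{int(ptr["size"]) / 1e9:.2f} GB')   # ~1.26 GB fp16 image encoder
```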
checkpoints/stable-video-diffusion-img2vid/image_encoder/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed1e5af7b4042ca30ec29999a4a5cfcac90b7fb610fd05ace834f2dcbb763eab
size 2528371296
checkpoints/stable-video-diffusion-img2vid/model_index.json
ADDED
@@ -0,0 +1,25 @@
{
  "_class_name": "StableVideoDiffusionPipeline",
  "_diffusers_version": "0.24.0.dev0",
  "_name_or_path": "diffusers/svd-test",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    "transformers",
    "CLIPVisionModelWithProjection"
  ],
  "scheduler": [
    "diffusers",
    "EulerDiscreteScheduler"
  ],
  "unet": [
    "diffusers",
    "UNetSpatioTemporalConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKLTemporalDecoder"
  ]
}
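model_index.json is what `StableVideoDiffusionPipeline.from_pretrained` reads to assemble the five components above from their subfolders. A minimal end-to-end sketch with diffusers, assuming a local conditioning image input.png:

```python
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "checkpoints/stable-video-diffusion-img2vid",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.enable_model_cpu_offload()  # one of the quality/memory/speed trade-offs

image = load_image("input.png").resize((1024, 576))
frames = pipe(image, decode_chunk_size=8).frames[0]
export_to_video(frames, "output.mp4", fps=7)
```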
checkpoints/stable-video-diffusion-img2vid/output_tile.gif
ADDED
Git LFS Details
checkpoints/stable-video-diffusion-img2vid/scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,20 @@
{
  "_class_name": "EulerDiscreteScheduler",
  "_diffusers_version": "0.24.0.dev0",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "interpolation_type": "linear",
  "num_train_timesteps": 1000,
  "prediction_type": "v_prediction",
  "set_alpha_to_one": false,
  "sigma_max": 700.0,
  "sigma_min": 0.002,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "timestep_spacing": "leading",
  "timestep_type": "continuous",
  "trained_betas": null,
  "use_karras_sigmas": true
}
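The scheduler runs in an effectively continuous-noise setup: v-prediction with Karras-spaced sigmas between sigma_min=0.002 and sigma_max=700.0 and "continuous" timestep conditioning. A small sketch of inspecting the resulting noise schedule:

```python
from diffusers import EulerDiscreteScheduler

scheduler = EulerDiscreteScheduler.from_pretrained(
    "checkpoints/stable-video-diffusion-img2vid/scheduler"
)
scheduler.set_timesteps(num_inference_steps=25)
print(scheduler.sigmas[:3])   # starts near sigma_max (~700)
print(scheduler.sigmas[-3:])  # decays toward sigma_min, ending at 0.0
```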
checkpoints/stable-video-diffusion-img2vid/svd.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e0994626df395a3831de024f11b2d9d241143bb6f16e2efbacced248aa18ce0
size 9559625980
checkpoints/stable-video-diffusion-img2vid/svd_image_decoder.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f454a3bdb92e0bc2aae634146208605f209c5a3d37f5bb87fbec8e5cca44dc18
size 9503252964
checkpoints/stable-video-diffusion-img2vid/unet/config.json
ADDED
@@ -0,0 +1,38 @@
{
  "_class_name": "UNetSpatioTemporalConditionModel",
  "_diffusers_version": "0.24.0.dev0",
  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--diffusers--svd-test/snapshots/b9d5dcd269e2f7bff9f98a4907b8c69b7acd555d/unet",
  "addition_time_embed_dim": 256,
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "cross_attention_dim": 1024,
  "down_block_types": [
    "CrossAttnDownBlockSpatioTemporal",
    "CrossAttnDownBlockSpatioTemporal",
    "CrossAttnDownBlockSpatioTemporal",
    "DownBlockSpatioTemporal"
  ],
  "in_channels": 8,
  "layers_per_block": 2,
  "num_attention_heads": [
    5,
    10,
    20,
    20
  ],
  "num_frames": 14,
  "out_channels": 4,
  "projection_class_embeddings_input_dim": 768,
  "sample_size": 96,
  "transformer_layers_per_block": 1,
  "up_block_types": [
    "UpBlockSpatioTemporal",
    "CrossAttnUpBlockSpatioTemporal",
    "CrossAttnUpBlockSpatioTemporal",
    "CrossAttnUpBlockSpatioTemporal"
  ]
}
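Two values here encode the image-to-video design: in_channels=8, because the 4-channel noisy video latent is concatenated channel-wise with the 4-channel latent of the conditioning frame, and num_frames=14, the clip length of this base SVD checkpoint. A minimal sketch of loading the UNet and checking its config:

```python
from diffusers import UNetSpatioTemporalConditionModel

unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "checkpoints/stable-video-diffusion-img2vid/unet"
)
# 8 input latent channels (4 noise + 4 conditioning image), 14-frame clips.
print(unet.config.in_channels, unet.config.out_channels, unet.config.num_frames)
```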
checkpoints/stable-video-diffusion-img2vid/unet/diffusion_pytorch_model.fp16.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cb552818963e736506c6693ffc279d59df423d63aced38902ea4373ab1fd2932
size 3049435868
checkpoints/stable-video-diffusion-img2vid/unet/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98c5e6b99df6bef015b2681c0f8ab9d4c807b733be46c067d6c9966101698f58
size 6098682464
checkpoints/stable-video-diffusion-img2vid/vae/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "_class_name": "AutoencoderKLTemporalDecoder",
  "_diffusers_version": "0.24.0.dev0",
  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--diffusers--svd-test/snapshots/b9d5dcd269e2f7bff9f98a4907b8c69b7acd555d/vae",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "force_upcast": true,
  "in_channels": 3,
  "latent_channels": 4,
  "layers_per_block": 2,
  "out_channels": 3,
  "sample_size": 768,
  "scaling_factor": 0.18215
}
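The encode side is the standard Stable Diffusion autoencoder (8× spatial downsampling, 4 latent channels), paired with a temporal decoder for cross-frame consistency; decoding therefore needs to know the clip length. A small shape-level sketch (tiny random input so it runs on CPU):

```python
import torch
from diffusers import AutoencoderKLTemporalDecoder

vae = AutoencoderKLTemporalDecoder.from_pretrained(
    "checkpoints/stable-video-diffusion-img2vid/vae"
).eval()

frames = torch.randn(2, 3, 256, 256)  # (T, C, H, W), a 2-frame toy clip
with torch.no_grad():
    latents = vae.encode(frames).latent_dist.sample() * vae.config.scaling_factor
    # The temporal decoder re-groups the flat batch into a clip of num_frames.
    video = vae.decode(latents / vae.config.scaling_factor, num_frames=2).sample
print(latents.shape, video.shape)  # (2, 4, 32, 32) and (2, 3, 256, 256)
```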
checkpoints/stable-video-diffusion-img2vid/vae/diffusion_pytorch_model.fp16.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af602cd0eb4ad6086ec94fbf1438dfb1be5ec9ac03fd0215640854e90d6463a3
size 195531910
checkpoints/stable-video-diffusion-img2vid/vae/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9975042d7bee021bd53a72b1af14c8627d624f6547ec9abe661b68b962b88c49
size 391017740
checkpoints/svd-robot/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
checkpoints/svd-robot/feature_extractor/preprocessor_config.json
ADDED
@@ -0,0 +1,27 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
checkpoints/svd-robot/image_encoder/config.json
ADDED
@@ -0,0 +1,23 @@
{
  "_name_or_path": "/cephfs/cjyyj/code/video_robot_svd/output/svd/train_2025-04-12T11-47-20/checkpoint-80000/image_encoder",
  "architectures": [
    "CLIPVisionModelWithProjection"
  ],
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "gelu",
  "hidden_size": 1280,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 5120,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 32,
  "patch_size": 14,
  "projection_dim": 1024,
  "torch_dtype": "float32",
  "transformers_version": "4.47.0"
}
checkpoints/svd-robot/image_encoder/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed1e5af7b4042ca30ec29999a4a5cfcac90b7fb610fd05ace834f2dcbb763eab
size 2528371296
checkpoints/svd-robot/model_index.json
ADDED
@@ -0,0 +1,25 @@
{
  "_class_name": "StableVideoDiffusionPipeline",
  "_diffusers_version": "0.32.2",
  "_name_or_path": "/cephfs/cjyyj/code/video_robot_svd/output/svd/train_2025-04-12T11-47-20/checkpoint-80000",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    "transformers",
    "CLIPVisionModelWithProjection"
  ],
  "scheduler": [
    "diffusers",
    "EulerDiscreteScheduler"
  ],
  "unet": [
    "diffusers",
    "UNetSpatioTemporalConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKLTemporalDecoder"
  ]
}
checkpoints/svd-robot/scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,24 @@
{
  "_class_name": "EulerDiscreteScheduler",
  "_diffusers_version": "0.32.2",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "final_sigmas_type": "zero",
  "interpolation_type": "linear",
  "num_train_timesteps": 1000,
  "prediction_type": "v_prediction",
  "rescale_betas_zero_snr": false,
  "set_alpha_to_one": false,
  "sigma_max": 700.0,
  "sigma_min": 0.002,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "timestep_spacing": "leading",
  "timestep_type": "continuous",
  "trained_betas": null,
  "use_beta_sigmas": false,
  "use_exponential_sigmas": false,
  "use_karras_sigmas": true
}
checkpoints/svd-robot/unet/config.json
ADDED
@@ -0,0 +1,38 @@
{
  "_class_name": "UNetSpatioTemporalConditionModel",
  "_diffusers_version": "0.32.2",
  "_name_or_path": "/cephfs/cjyyj/code/video_robot_svd/output/svd/train_2025-04-12T11-47-20/checkpoint-80000/unet",
  "addition_time_embed_dim": 256,
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "cross_attention_dim": 1024,
  "down_block_types": [
    "CrossAttnDownBlockSpatioTemporal",
    "CrossAttnDownBlockSpatioTemporal",
    "CrossAttnDownBlockSpatioTemporal",
    "DownBlockSpatioTemporal"
  ],
  "in_channels": 8,
  "layers_per_block": 2,
  "num_attention_heads": [
    5,
    10,
    20,
    20
  ],
  "num_frames": 14,
  "out_channels": 4,
  "projection_class_embeddings_input_dim": 768,
  "sample_size": 96,
  "transformer_layers_per_block": 1,
  "up_block_types": [
    "UpBlockSpatioTemporal",
    "CrossAttnUpBlockSpatioTemporal",
    "CrossAttnUpBlockSpatioTemporal",
    "CrossAttnUpBlockSpatioTemporal"
  ]
}
checkpoints/svd-robot/unet/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e3f7a04e7dbab1c18ab5798ba2c0120322a13fe9962d98789bc7a803c037750
size 6098682464
checkpoints/svd-robot/vae/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "_class_name": "AutoencoderKLTemporalDecoder",
  "_diffusers_version": "0.32.2",
  "_name_or_path": "/cephfs/cjyyj/code/video_robot_svd/output/svd/train_2025-04-12T11-47-20/checkpoint-80000/vae",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "force_upcast": true,
  "in_channels": 3,
  "latent_channels": 4,
  "layers_per_block": 2,
  "out_channels": 3,
  "sample_size": 768,
  "scaling_factor": 0.18215
}
checkpoints/svd-robot/vae/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9975042d7bee021bd53a72b1af14c8627d624f6547ec9abe661b68b962b88c49
size 391017740