adamirus commited on
Commit
12d8157
1 Parent(s): 7ffaab0

Upload folder using huggingface_hub

Browse files
__pycache__/app.cpython-310.pyc CHANGED
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ
 
app.py CHANGED
@@ -4,7 +4,7 @@ from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
4
  from diffusers.utils import export_to_video
5
  import os
6
 
7
-
8
  def generate_video(prompt):
9
  # load pipeline
10
  pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16,
 
4
  from diffusers.utils import export_to_video
5
  import os
6
 
7
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
8
  def generate_video(prompt):
9
  # load pipeline
10
  pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16,
aps.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import snapshot_download
2
+
3
+ from modelscope.pipelines import pipeline
4
+ from modelscope.outputs import OutputKeys
5
+ import pathlib
6
+ import gradio as gr
7
+
8
+
9
# Lazily-built ModelScope pipeline, shared by every request so the weights are
# fetched and loaded only once per process instead of once per call.
_PIPELINE = None


def _get_pipeline():
    """Return the text-to-video pipeline, building it on first use."""
    global _PIPELINE
    if _PIPELINE is None:
        model_dir = pathlib.Path('weights')
        snapshot_download('damo-vilab/modelscope-damo-text-to-video-synthesis',
                          repo_type='model', local_dir=model_dir)
        _PIPELINE = pipeline('text-to-video-synthesis', model_dir.as_posix())
    return _PIPELINE


def video_gen(prompt):
    """Generate a video from a text prompt.

    Args:
        prompt: The text description typed by the user in the Gradio UI.

    Returns:
        The path of the generated video file, as reported by the pipeline
        under ``OutputKeys.OUTPUT_VIDEO``.
    """
    pipe = _get_pipeline()
    # The pipeline takes a dict keyed by 'text'. Pass the caller's prompt
    # through instead of replacing it with a fixed sample sentence.
    output_video_path = pipe({'text': prompt})[OutputKeys.OUTPUT_VIDEO]
    return output_video_path


demo = gr.Interface(fn=video_gen, inputs="text", outputs="video")
demo.launch(share=True)
weights/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
weights/README.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ pipeline_tag: text-to-video
4
+ ---
5
+
6
+ The original repo is [here](https://modelscope.cn/models/damo/text-to-video-synthesis/summary).
7
+
8
+ **We Are Hiring!** (Based in Beijing / Hangzhou, China.)
9
+
10
+ If you're looking for an exciting challenge and the opportunity to work with cutting-edge technologies in AIGC and large-scale pretraining, then we are the place for you. We are looking for talented, motivated and creative individuals to join our team. If you are interested, please send your CV to us.
11
+
12
+ EMAIL: yingya.zyy@alibaba-inc.com
13
+
14
+ This model is based on a multi-stage text-to-video generation diffusion model, which inputs a description text and returns a video that matches the text description. Only English input is supported.
15
+
16
+ ## Model Description
17
+
18
+ The text-to-video generation diffusion model consists of three sub-networks: text feature extraction, a text-feature-to-video-latent-space diffusion model, and a video-latent-space-to-video-visual-space decoder. The model has about 1.7 billion parameters in total. Only English input is supported. The diffusion model adopts the Unet3D structure and generates video through an iterative denoising process that starts from a video of pure Gaussian noise.
19
+
20
+ **This model is meant for research purposes. Please look at the [model limitations and biases](#model-limitations-and-biases) and [misuse, malicious use and excessive use](#misuse-malicious-use-and-excessive-use) sections.**
21
+
22
+ **How the model is expected to be used and where it is applicable**
23
+
24
+ This model has a wide range of applications and can reason and generate videos based on arbitrary English text descriptions.
25
+
26
+ ## How to use
27
+
28
+
29
+ The model is available on [ModelScope Studio](https://modelscope.cn/studios/damo/text-to-video-synthesis/summary) and [Hugging Face](https://huggingface.co/spaces/damo-vilab/modelscope-text-to-video-synthesis), where you can try it directly; you can also refer to the [Colab page](https://colab.research.google.com/drive/1uW1ZqswkQ9Z9bp5Nbo5z59cAn7I0hE6R?usp=sharing#scrollTo=bSluBq99ObSk) to build it yourself.
30
+ In order to facilitate the experience of the model, users can refer to the [Aliyun Notebook Tutorial](https://modelscope.cn/headlines/detail/26) to quickly develop this Text-to-Video model.
31
+
32
+ This demo requires about 16GB CPU RAM and 16GB GPU RAM. Under the ModelScope framework, the current model can be used by calling a simple Pipeline, where the input must be a dictionary whose only valid key is 'text' and whose value is a short text. This model currently only supports inference on the GPU. A concrete code example follows:
33
+
34
+
35
+ ### Operating environment (Python Package)
36
+
37
+ ```
38
+ pip install modelscope==1.4.2
39
+ pip install open_clip_torch
40
+ pip install pytorch-lightning
41
+ ```
42
+
43
+ ### Code example (Demo Code)
44
+
45
+ ```python
46
+ from huggingface_hub import snapshot_download
47
+
48
+ from modelscope.pipelines import pipeline
49
+ from modelscope.outputs import OutputKeys
50
+ import pathlib
51
+
52
+ model_dir = pathlib.Path('weights')
53
+ snapshot_download('damo-vilab/modelscope-damo-text-to-video-synthesis',
54
+ repo_type='model', local_dir=model_dir)
55
+
56
+ pipe = pipeline('text-to-video-synthesis', model_dir.as_posix())
57
+ test_text = {
58
+ 'text': 'A panda eating bamboo on a rock.',
59
+ }
60
+ output_video_path = pipe(test_text,)[OutputKeys.OUTPUT_VIDEO]
61
+ print('output_video_path:', output_video_path)
62
+ ```
63
+
64
+ ### View results
65
+
66
+ The above code will display the save path of the output video, and the current encoding format can be played normally with [VLC player](https://www.videolan.org/vlc/).
67
+
68
+ The output mp4 file can be viewed with [VLC media player](https://www.videolan.org/vlc/). Some other media players may not play it correctly.
69
+
70
+ ## Model limitations and biases
71
+
72
+ * The model is trained based on public data sets such as Webvid, and the generated results may have deviations related to the distribution of training data.
73
+ * This model cannot achieve perfect film and television quality generation.
74
+ * The model cannot generate clear text.
75
+ * The model is mainly trained on an English corpus and does not support other languages at the moment.
76
+ * The performance of this model needs to be improved on complex compositional generation tasks.
77
+
78
+ ## Misuse, Malicious Use and Excessive Use
79
+
80
+ * The model was not trained to realistically represent people or events, so using it to generate such content is beyond the model's capabilities.
81
+ * It is prohibited to generate content that is demeaning or harmful to people or their environment, culture, religion, etc.
82
+ * It is prohibited to generate pornographic, violent, or gory content.
83
+ * It is prohibited to generate erroneous or false information.
84
+
85
+ ## Training data
86
+
87
+ The training data includes [LAION5B](https://huggingface.co/datasets/laion/laion2B-en), [ImageNet](https://www.image-net.org/), [Webvid](https://m-bain.github.io/webvid-dataset/) and other public datasets. Image and video filtering is performed after pre-training such as aesthetic score, watermark score, and deduplication.
88
+
89
+ ## Citation
90
+
91
+ ```bibtex
92
+ @InProceedings{VideoFusion,
93
+ author = {Luo, Zhengxiong and Chen, Dayou and Zhang, Yingya and Huang, Yan and Wang, Liang and Shen, Yujun and Zhao, Deli and Zhou, Jingren and Tan, Tieniu},
94
+ title = {VideoFusion: Decomposed Diffusion Models for High-Quality Video Generation},
95
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
96
+ month = {June},
97
+ year = {2023}
98
+ }
99
+ ```
weights/VQGAN_autoencoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88ecb782561455673c4b78d05093494b9c539fc6bfc08f3a9a4a0dd7b0b10f36
3
+ size 5214865159
weights/configuration.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "framework": "pytorch",
2
+ "task": "text-to-video-synthesis",
3
+ "model": {
4
+ "type": "latent-text-to-video-synthesis",
5
+ "model_args": {
6
+ "ckpt_clip": "open_clip_pytorch_model.bin",
7
+ "ckpt_unet": "text2video_pytorch_model.pth",
8
+ "ckpt_autoencoder": "VQGAN_autoencoder.pth",
9
+ "max_frames": 16,
10
+ "tiny_gpu": 1
11
+ },
12
+ "model_cfg": {
13
+ "unet_in_dim": 4,
14
+ "unet_dim": 320,
15
+ "unet_y_dim": 768,
16
+ "unet_context_dim": 1024,
17
+ "unet_out_dim": 4,
18
+ "unet_dim_mult": [1, 2, 4, 4],
19
+ "unet_num_heads": 8,
20
+ "unet_head_dim": 64,
21
+ "unet_res_blocks": 2,
22
+ "unet_attn_scales": [1, 0.5, 0.25],
23
+ "unet_dropout": 0.1,
24
+ "temporal_attention": "True",
25
+ "num_timesteps": 1000,
26
+ "mean_type": "eps",
27
+ "var_type": "fixed_small",
28
+ "loss_type": "mse"
29
+ }
30
+ },
31
+ "pipeline": {
32
+ "type": "latent-text-to-video-synthesis"
33
+ }
34
+ }
weights/text2video_pytorch_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e139fb08e50a6c072127d9ecef4fe8e91dbdf24edad23a4e1a7c569f0ca3488
3
+ size 5645549049