zphilip48 committed on
Commit 4bbeb03
1 Parent(s): b27507f

update config1.1 content

config1.1/config.json CHANGED
@@ -27,6 +27,6 @@
  ],
  "patch_size": 4,
  "torch_dtype": "float32",
- "transformers_version": "4.29.2",
+ "transformers_version": "4.34.1",
  "window_size": 7
  }
config1.1/config.yaml CHANGED
@@ -1,9 +1,9 @@
- resume_from_checkpoint_path: '/home/ubuntu/notebook/deeplearning/labs/nougat/result/nougat/1.0/epoch=3-step=87876.ckpt'
+ resume_from_checkpoint_path: '/root/autodl-tmp/nougat-latex/config1.1/epoch=9-step=219690.ckpt'
  result_path: 'result'
  model_path: None
  dataset_paths:
- - '/home/ubuntu/notebook/nougat/nougat-dataset/train.jsonl'
+ - '/root/autodl-tmp/train.jsonl'
- tokenizer: './config/tokenizer.json'
+ tokenizer: '/root/autodl-tmp/nougat-latex/config1.1/tokenizer.json'
  exp_name: 'nougat'
  train_batch_sizes:
  - 1
config1.1/epoch=10-step=286760.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b214ca5f367bc4e758c63da93ce2ef81cb4757b737b2acb34b35890d697ffb6
+ size 2575546580
config1.1/epoch=11-step=353830.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df4ec22f0509b2f222af7f530f986e8231f8009b0d34b11764c93f9b637739e5
+ size 2575546452
config1.1/epoch=9-step=219690.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b81dfd93734814d95963e3d26c4da1c23ca98f9f7eeccb2110ee25f5df439344
+ size 2575567208
config1.1/last.ckpt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:445894d99452498ff147ff3b365f078db97cfec21e65ed7aede54086e3b656f5
- size 1298137088
+ oid sha256:4c94aa2ef6ef6ad7aeaf9eaa7058bdee954726fc94891e4b3b7fbfbbc31aa7e8
+ size 2575546644
config1.1/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cb5234cae82cf10eabd5bef34db9f6661524c74ad30074ef32c382e692f0512f
+ oid sha256:d6665d4944ab4e9026f5e613234fd1799886b2bf953065a0663cbf93ebd2169a
  size 867125489
config1.1/tokenizer_config.json CHANGED
@@ -1,5 +1,202 @@
  {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "4": {
+ "content": "[START_REF]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "5": {
+ "content": "[END_REF]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "6": {
+ "content": "[IMAGE]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "7": {
+ "content": "<fragments>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "8": {
+ "content": "</fragments>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "9": {
+ "content": "<work>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "10": {
+ "content": "</work>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "11": {
+ "content": "[START_SUP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "12": {
+ "content": "[END_SUP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "13": {
+ "content": "[START_SUB]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "14": {
+ "content": "[END_SUB]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "15": {
+ "content": "[START_DNA]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "16": {
+ "content": "[END_DNA]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "17": {
+ "content": "[START_AMINO]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "18": {
+ "content": "[END_AMINO]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "19": {
+ "content": "[START_SMILES]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "20": {
+ "content": "[END_SMILES]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "21": {
+ "content": "[START_I_SMILES]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "22": {
+ "content": "[END_I_SMILES]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
+ "eos_token": "</s>",
+ "max_length": 4096,
  "model_max_length": 1000000000000000019884624838656,
+ "pad_to_multiple_of": null,
+ "pad_token": "<pad>",
+ "pad_token_type_id": 0,
+ "padding_side": "right",
+ "stride": 0,
- "tokenizer_class": "PreTrainedTokenizerFast"
+ "tokenizer_class": "PreTrainedTokenizerFast",
+ "truncation_side": "right",
+ "truncation_strategy": "longest_first",
+ "unk_token": "<unk>"
  }
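For reference, a minimal sketch (not part of this commit) of loading the updated tokenizer with the `transformers` version pinned above; it assumes the repository has been cloned locally so that `config1.1/` contains both `tokenizer.json` and the `tokenizer_config.json` shown in this diff.

```python
# Sketch: load the tokenizer from the local config1.1/ directory and check
# that the special tokens declared in added_tokens_decoder are picked up.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("config1.1")

print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)  # <s> </s> <pad> <unk>
print(tok.convert_tokens_to_ids("[START_REF]"))                    # 4, per added_tokens_decoder

enc = tok("x^{2}+y^{2}=1", return_tensors="pt")
print(enc["input_ids"].shape)
```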
config2/config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "/home/ubuntu/notebook/data1/nougat/nougat",
+ "_name_or_path": "/fsx-llm/lblecher/checkpoints/nougat/perturb/20230426_125023",
  "align_long_axis": false,
  "architectures": [
  "NougatModel"
output/downloaded_paper_cd5d570ea4d9429bb2afbeb23dc1e159.mmd ADDED
@@ -0,0 +1,184 @@
1
+ # Dual-Stream Diffusion Net for Text-to-Video Generation
2
+
3
+ Binhui Liu\({}^{1}\), Xin Liu\({}^{2}\), Anbo Dai\({}^{2}\), Zhiyong Zeng\({}^{1}\),
4
+
5
+ **Zhen Cui\({}^{1}\), Jian Yang\({}^{3}\) \({}^{1}\)**Nanjing University of Science and Technology, Nanjing, China
6
+
7
+ \({}^{2}\)SeetaCloud, Nanjing, China
8
+
9
+ \({}^{3}\)Nankai University, Tianjin, China
10
+
11
+ {lbhasura, zhiyong.zeng, zhen.cui}@njust.edu.cn,
12
+
13
+ {xin.liu, anbo.dai}@seetacloud.com, csjyang@nankai.edu.cn
14
+
15
+ ###### Abstract
16
+
17
+ With the emergence of diffusion models, text-to-video generation has recently attracted increasing attention. An important bottleneck, however, is that generated videos often carry flickers and artifacts. In this work, we propose a dual-stream diffusion net (DSDN) to improve the consistency of content variations in generated videos. In particular, the two designed diffusion streams, a video content branch and a motion branch, not only run separately in their private spaces to produce personalized video content and variations, but are also aligned between the content and motion domains through our cross-transformer interaction module, which benefits the smoothness of the generated videos. Besides, we also introduce a motion decomposer and combiner to facilitate operations on video motion. Qualitative and quantitative experiments demonstrate that our method produces continuous videos with fewer flickers (see Fig. 1)1.
18
+
19
+ Footnote 1: Please see the videos in supplementary material, and more info including code could be found in the anonymous website: [https://anonymous.4open.science/r/Private-C3E8](https://anonymous.4open.science/r/Private-C3E8)
20
+
21
+ ## Introduction
22
+
23
+ In the realm of artificial intelligence generated content, one of the most exciting and challenging tasks is the transformation from text into visual content. This task not only benefits for our understanding of natural language processing but also promotes computer vision techniques. Meantime, it will cause immense potential applications in entertainment, advertising, education, and surveillance. Over the past few years, there has been substantial progress in developing models that convert textual descriptions into images [19, 20, 27]. In contrast to images, videos could carry/express richer content. For textual descriptions, videos can capture and convey intricate narratives well therein [21, 22]. Recently, the text-to-video generation has been
24
+
25
+ Figure 1: Samples generated by our method.
26
+
27
+ attracting increasing attention, especially with the rise of diffusion models.
28
+
29
+ One critical challenge lies in reasonable and continuous content generation in spatial-temporal domain for videos, not only spatial content learnt by image-based diffusion models. Presently, the existing methods [1, 13, 14] of text-to-video generation primarily focused on reproducing visual content from text. Due to the insufficiency in modeling video dynamics, their generated videos often contain many flickers and are intermittent in visual effects. To address this issue, in this work, our goal is to increase the consistency of motion and content between these video frames, while augmenting the motion diversity of generated videos, so as to generate better visually continuous videos.
30
+
31
+ To this end, we propose a dual-stream diffusion net (DSDN) to boost the consistency of content variations in generated videos. In particular, to characterize the motion information of a video, we introduce a motion branch that encodes video content variations alongside the video content branch. We thereby construct a two-branch diffusion network: a video motion stream and a video content stream. To make full use of large-scale image generation models, the content stream runs upon a pre-trained text-to-image conditional diffusion model, while being updated incrementally with a parallel network for personalized video content generation. In parallel, the variations between video frames, i.e., the motion, undergo a separate probabilistic diffusion process through a 3D U-Net, so that personalized motion information can be generated. To align the generated content and motion, we design a dual-stream transformation interaction module using cross-attention between the two streams. Accordingly, the motion stream is integrated with the content stream during the denoising process, which allows each stream to serve as contextual information for the other. Besides, we also introduce a motion decomposer and combiner to facilitate operations on motion. Qualitative and quantitative experiments demonstrate that our method is able to produce visually continuous videos, as shown in Fig. 1.
32
+
33
+ In the end, we briefly summarize the contributions to the realm of text-to-video generation: i) propose a Dual-Stream Diffusion Net (DSDN) to enhance the consistency and diversity of generated videos, where the motion is specifically modeled as a single branch that distinguishes from most existing video diffusion methods; ii) design some useful modules, including personalized content/motion generation, dual-stream transformation interaction, to align content and motion while preserving the diversity of generated samples; iii) qualitative and quantitative evaluations demonstrate that DSDN could effectively generates videos with remarkable consistency and diversity.
34
+
35
+ ## Related Work
36
+
37
+ The development and evolution of models for converting textual descriptions into visual content have been a consistent focus in the field of artificial intelligence. The research has gradually transitioned from text-to-image models to more dynamic and complex text-to-video generation models.
38
+
39
+ Text-to-Image GenerationEarly efforts were dedicated to developing techniques for text-to-image synthesis. The Denoising Diffusion Probabilistic Model (DDPM) [13, 14, 15] has garnered significant attention owing to its remarkable ability to generate high-quality images. This innovative model has exceeded the performance of previous generative adversarial networks (GANs) [16], setting a new benchmark in the field. Furthermore, the DDPM has a unique feature: it can be trained with text guidance, empowering users to generate images from textual inputs. Several notable advancements have been made in this area. For instance, GLIDE [17] adopts classifier-free guidance and trains the diffusion model using large-scale text-image pairs. DALLE-2 [10] uses CLIP [12] latent space as a condition, which significantly enhances the performance. Imagen [1] employs a T5 [13] coupled with cascaded diffusion models to generate high-resolution images. The Latent Diffusion Model (LDM) [1] proposes forwarding the diffusion process in latent space, demonstrating higher efficiency than other diffusion models.
40
+
41
+ Text-to-Video GenerationDespite these advancements in text-to-image models, transitioning to text-to-video synthesis presented new challenges, mainly due to the temporal dependencies between video frames and the need to maintain motion semantics throughout the video sequence. Early works in this regard include GAN-based methods [15, 16] and auto-regressive one [18, 14]. In the context of unconditional video generation, Ho et al. [14] successfully extended the DDPM models initially designed for images into the video domain, leading to the development of a 3D U-Net architecture. Harvey et al. [16] put forth an innovative approach wherein they modeled the distribution of subsequent video frames in an auto-regressive manner. Our primary focus, however, lies in synthesizing videos in a controllable manner - more specifically, in text-conditional video generation. Exploring this avenue, Hong et al. [14] proposed CogVideo, an autoregressive framework that models the video sequence by conditioning it on the given text and the previous frames. Similarly, Levon et al. [17] proposed the Text2Video-Zero, a text-to-video generation method based on the text-to-image model stable diffusion [15], which can not only directly generate text-to-video, but also directly complete image editing tasks. The current issue in the text-to-video domain is that generative videos often tend to carry some flickers and artifacts. Few attempts made to capture both the visual and dynamic aspects of videos include the latent stream diffusion models proposed by Ni et al. [13], and et al. [14] projected latent video diffusion model for generating long video through the integration of spatial and temporal information flow. These have been success fully used in tasks such as generating high-quality images from textual descriptions [14, 13, 15], while their potential in generating dynamic videos from text remains largely untapped. Our work is inspired by these previous research efforts and seeks to address the pitfalls common in existing models. We introduce a novel dual-stream diffusion net to improve the consistency of content variations in generating videos.
42
+
43
+ ## Method
44
+
45
+ In this section, we first provide an overview on the network, and then illustrate the details therein.
46
+
47
+ **Overview.** The proposed DSDN network architecture is shown in Fig. 2. Initially, the input video \(x_{0}^{1:L}\) is projected into a latent space via a frame-wise encoder \(\mathcal{E}\), denoted as \(z_{0}^{1:L}=\mathcal{E}(x_{0}^{1:L})\), where \(L\) is the length of the video. Because this encoding is frame-wise and carries no temporal dynamics, we call \(z_{0}^{1:L}\) the content features. To mine temporal cues for better video generation, we introduce a motion decomposer to extract the corresponding motion information, denoted as \(\tilde{z}_{0}^{1:L}\). Taking both latent features \(z_{0}^{1:L}\) and \(\tilde{z}_{0}^{1:L}\) as input, we use a dual-stream diffusion process to produce personalized video content and motion variations, and subsequently propose a transformed interaction module to integrate the two components into a generated video.
48
+
49
+ At the stage of dual-stream diffusion, two types of latent features are transformed to standard Gaussian priors through separate Forward Diffusion Processes (FDP). Then, the content feature prior undergoes denoising along Personalized Content Generation Stream (PCGS), which would result in pure denoised content features \(z_{0}^{1:L}\). Similarly, the motion feature prior is denoised by Personalized Motion Generation Stream, which would lead to pure denoised motion features \(\tilde{z}_{0}^{1:L}\). To further align the generated content and motion for suppressing flickers, we design a Dual-Stream Transformation Interaction module to bridge the two types of generation streams. After the alignment learning, we use a motion combiner to compensate dynamic information to video content, and finally form the latent feature of the target video, following by a decoder \(\mathcal{D}\) to produce videos in the pixel space.
50
+
51
+ **Forward Diffusion Process.** To reduce resource consumption, we follow the latent diffusion model [12], underpinned by Denoising Diffusion Probabilistic Models (DDPM) [10]. Before diffusion, we use a pre-trained Vector Quantized Variational AutoEncoder (VQ-VAE) (van den Oord, Vinyals, and Kavukcuoglu 2018) to project video frames into a latent feature space, i.e., \(z_{0}^{1:L}=\mathcal{E}(x_{0}^{1:L})\). For simplicity, we freeze the encoder \(\mathcal{E}\) during training. The content features \(z_{0}^{1:L}\) are then processed through a motion decomposer (see the part on Motion Decomposition and Combination) to obtain the motion features \(\tilde{z}_{0}^{1:L}\). Both sets of features are perturbed with noise through a pre-defined Markov process, formally,
52
+
53
+ \[\begin{split} q(z_{t}|z_{t-1})&=\mathcal{N}(z_{t}; \sqrt{1-\beta_{t}}z_{t-1},\beta_{t}I),\\ q^{\prime}(\tilde{z}_{t}|\tilde{z}_{t-1})&=\mathcal{ N}(\tilde{z}_{t};\sqrt{1-\beta_{t}}\tilde{z}_{t-1},\beta_{t}I),\end{split} \tag{1}\]
54
+
55
+ where \(t=1,...,T\), \(T\) is the number of diffusion steps, and \(\beta_{t}\) defines the noise strength at each step. It is worth noting that sharing the noising schedule across the two streams works well in our experience. Following DDPM, the above recursion can be collapsed into the condensed form,
56
+
57
+ \[\begin{split} z_{t}^{1:L}&=\sqrt{\bar{\alpha_{t}}} z_{0}^{1:L}+\sqrt{1-\bar{\alpha_{t}}}\epsilon_{1},\epsilon_{1}\sim\mathcal{N}(0,I), \\ \tilde{z}_{t}^{1:L}&=\sqrt{\bar{\alpha_{t}}}\tilde{z}_ {0}^{1:L}+\sqrt{1-\bar{\alpha_{t}}}\epsilon_{2},\epsilon_{2}\sim\mathcal{N}(0,I ),\end{split} \tag{2}\]
58
+
59
+ where \(\bar{\alpha_{t}}=\prod_{i=1}^{t}\alpha_{t},\alpha_{t}=1-\beta_{t}\). Until now, we have successfully completed the forward diffusion process for both content and motion features. This provides us with the priors \(z_{T}^{1:L}\) and \(\tilde{z}_{T}^{1:L}\) which are instrumental in driving the ensuing denoising process.
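As a concrete illustration of Eq. 2 (an editor's sketch, not taken from the paper's code), the closed-form forward noising can be applied to content and motion latents with a shared schedule; the latent shapes and the linear beta schedule below are assumptions.

```python
import torch

def forward_diffuse(z0, alpha_bar, t):
    """Closed-form forward diffusion (Eq. 2): z_t = sqrt(abar_t)*z_0 + sqrt(1-abar_t)*eps."""
    eps = torch.randn_like(z0)
    abar_t = alpha_bar[t].view(-1, *([1] * (z0.dim() - 1)))  # broadcast over (L, C, H, W)
    zt = abar_t.sqrt() * z0 + (1.0 - abar_t).sqrt() * eps
    return zt, eps

# Assumed linear beta schedule and latent shape (B, L, C, H, W).
T = 1000
betas = torch.linspace(1e-4, 2e-2, T)
alpha_bar = torch.cumprod(1.0 - betas, dim=0)

z0 = torch.randn(2, 16, 4, 32, 32)   # content latents z_0^{1:L}
m0 = torch.randn(2, 16, 4, 32, 32)   # motion latents  \tilde{z}_0^{1:L}
t = torch.randint(0, T, (2,))
zt, eps1 = forward_diffuse(z0, alpha_bar, t)   # both streams share the same schedule
mt, eps2 = forward_diffuse(m0, alpha_bar, t)
```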
60
+
61
+ **Personalized Content Generation Stream.** To take advantage of a well-trained image-based diffusion model, we leverage the large-scale text-to-image model Stable Diffusion [12] as the fundamental model for video content generation. To better support personalized video content generation, we design an incremental learning module that refines content generation in a manner similar to LoRA [13]. As shown in Fig. 2, we refer to these as the content basic unit and the content increment unit, respectively. The parameters of the two units are adaptively integrated to boost content features. This design not only inherits the merits of the large-scale image-based generation model but also enables the creation of unique and personalized content, contributing to the overall improvement of our method.
62
+
63
+ Concretely, the content basic unit uses a modified U-Net architecture, where each resolution level incorporates 2D convolution layers with self-attention and cross-attention mechanisms. Concurrently, the content increment unit employs an extra network branch with a few tunable parameters for fine tuning. Suppose the basic unit is with parameters \(W\), we have the post-tuned weight: \(W^{\prime}=W+\lambda\Delta W\), where \(\Delta W\) is the update quantity and \(\lambda\) is the step length. The hyper-parameter \(\lambda\) dictates the influence exerted by the tuning process, thereby offering users extensive control over the generation outcome. To counter potential over-fitting and reduce computational overhead, \(\Delta W\in\mathbb{R}^{m\times n}\) is decomposed into two low-rank matrices, as used in LoRA [13]. Let's denote \(\Delta W=AB^{T}\), where \(A\in\mathbb{R}^{m\times r}\), \(B\in\mathbb{R}^{n\times r}\), and \(r\ll m,n\).
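A minimal sketch of the low-rank incremental update \(W^{\prime}=W+\lambda\Delta W\) with \(\Delta W=AB^{T}\) described above; the layer size, rank, and \(\lambda\) are illustrative assumptions rather than the authors' settings.

```python
import torch
import torch.nn as nn

class IncrementalLinear(nn.Module):
    """Frozen base weight W plus a tunable low-rank update: W' = W + lam * A @ B.T."""
    def __init__(self, base: nn.Linear, rank: int = 4, lam: float = 1.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():       # content basic unit stays frozen
            p.requires_grad_(False)
        m, n = base.out_features, base.in_features
        self.A = nn.Parameter(torch.zeros(m, rank))        # zero-init so Delta W starts at 0
        self.B = nn.Parameter(torch.randn(n, rank) * 0.01)
        self.lam = lam

    def forward(self, x):
        delta_w = self.A @ self.B.t()          # low-rank Delta W, rank << m, n
        return self.base(x) + self.lam * nn.functional.linear(x, delta_w)

layer = IncrementalLinear(nn.Linear(320, 320), rank=8, lam=0.8)
y = layer(torch.randn(1, 77, 320))
```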
64
+
65
+ To improve the smoothness of generated content frames, we also generate video content on the condition of motion information, which refers to cross-transformer introduced in the part: Dual-Stream Transformation Interaction. Formally, we give the optimized objective on content information,
66
+
67
+ \[\mathcal{L}_{con}=\mathbb{E}_{\bar{z},y,t}[\big{\|}\epsilon-\epsilon_{\theta} (z_{t}^{1:L},t\mid c(y),\tilde{z}_{t}^{1:L})\big{\|}_{2}^{2}], \tag{3}\]
68
+
69
+ where \(y\) is the corresponding textual description, \(\epsilon_{\theta}(\cdot)\) here represents the part of the personalized content generation stream with the network parameter \(\theta\). Note that we employ the text encoder of CLIP [1] to perform the text feature extraction \(c(\cdot)\).
70
+
71
+ **Personalized Motion Generation Stream.** In the personalized motion generation stream, we employ a 3D U-Net based diffusion model to generate motion-coherent latent features, where the 3D U-Net architecture is similar to that in [10]. We use a 3D U-Net because it can capture the global motion variation of the entire input video for subsequent motion generation. Given an input sequence of motion priors \(\widetilde{z}_{T}^{1:L}\), the encoding stage of the 3D U-Net yields a transformed representation, and the subsequent denoising process takes this representation as input for diffusion, similar to DDPM. Differently, to match the generated motion with the content, we use the generated content features \(z_{t}^{1:L}\) as well as the text prompt \(c(y)\) as conditions in the denoising diffusion process. The content conditioning is realized with the cross-transformer introduced in the next part, Dual-Stream Transformation Interaction. The training objective of the personalized motion generation stream can be formulated as:
72
+
73
+ \[\mathcal{L}_{mot}=\mathbb{E}_{z,y,t}[\big{\|}\epsilon-\epsilon_{\hat{\theta}}( \tilde{z}_{t}^{1:L},t\mid c(y),z_{t}^{1:L})\big{\|}_{2}^{2}], \tag{4}\]
74
+
75
+ where \(\epsilon_{\hat{\theta}}(\cdot)\) represents the personalized motion generation stream with network parameters \(\hat{\theta}\).
76
+
77
+ **Dual-Stream Transformation Interaction.** To align the generated content and motion information, we design a cross-transformer interaction between the two denoising streams. On the one hand, we infuse the denoising procedure of the motion stream with conditional feature information from the content stream via a content-to-motion transformer, which enhances the continuity of the overall motion. On the other hand, the denoising process of the content stream absorbs conditional feature information from the motion stream via a motion-to-content transformer. These cross-transformer streams render the overall content smoother, creating a synergistic effect that enhances the consistency and quality of the final output. In detail, after each convolutional layer of the U-Net, we interpose a cross-attention layer to integrate the latent features of the content and motion streams. Taking the motion-to-content case as an example, formally, we have
78
+
79
+ \[z_{con}\text{=Att}(Q_{mot},K_{con},V_{con})\text{=Softmax}(\frac{Q_{mot}K_{con}^ {T}}{\sqrt{d}})\cdot V_{con}, \tag{5}\]
80
+
81
+ where \(Q_{mot}=W^{Q}z_{mot},K_{con}=W^{K}z_{con},\) and \(V_{con}=W^{V}z_{con}\) denote three projections of cross-attention along the content stream with the parameters \(W^{Q},W^{K},W^{V}\), and \(d\) is the feature dimensionality. The motion stream features can constrain the content stream generated to ensure smoother transitions from frame to frame. Similarly, the same principle applies for the motion stream as well. At this time, the content stream features can supply an understanding of image apparent information for the generation of motion latent features during the denoising process. Hence, the cross-attention layer in this context facilitates mutual conditioning between the dual-stream features.
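To make Eq. 5 concrete, here is a small single-head sketch of one motion-to-content cross-attention layer (the feature dimension and the flattened spatial token layout are assumptions; the paper's actual block layout may differ).

```python
import torch
import torch.nn as nn

class StreamCrossAttention(nn.Module):
    """Eq. 5: z_con = softmax(Q_mot K_con^T / sqrt(d)) V_con, with Q from the motion
    stream and K, V from the content stream (and symmetrically for the other direction)."""
    def __init__(self, d: int = 320):
        super().__init__()
        self.q = nn.Linear(d, d, bias=False)   # W^Q applied to motion features
        self.k = nn.Linear(d, d, bias=False)   # W^K applied to content features
        self.v = nn.Linear(d, d, bias=False)   # W^V applied to content features
        self.scale = d ** -0.5

    def forward(self, z_mot, z_con):
        q, k, v = self.q(z_mot), self.k(z_con), self.v(z_con)
        attn = torch.softmax(q @ k.transpose(-2, -1) * self.scale, dim=-1)
        return attn @ v                        # conditioned content features

# Assumed token layout: (batch*frames, H*W, d) flattened spatial positions.
block = StreamCrossAttention(d=320)
z_con = torch.randn(16, 32 * 32, 320)
z_mot = torch.randn(16, 32 * 32, 320)
z_con_new = block(z_mot, z_con)
```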
82
+
83
+ Intuitively, such a cross-transformer strategy is explicitly
84
+
85
+ Figure 2: DSDN network framework. Initially, Content and motion features are added to noise during the diffusion process, followed by a denoising step via the dual-stream diffusion net. Lastly, the latent space features of the generated video are obtained through the motion combiner and decoded to render the final generated video.
86
+
87
+ performed on two branches of diffusion processes. This is very different from those previous video diffusion methods [1, 13, 14], which essentially use a single-stream diffusion process by either directly inserting a pseudo 3D layer to manage temporal information, or intercalating a 3D layer between two successive 2D convolution layers. Besides, the dual-stream diffusion network also take the corresponding textual conditional embedding as input.
88
+
89
+ In the end, the total optimization objective of dual-stream diffusion net is the joint in both Eq. 3 and Eq. 4. Throughout the optimization process, only the content increment unit (in the content stream) and the cross-attention layer (between two denoising streams) are trainable, whilst the content basic unit, i.e., the underlying text-to-image model in the content stream, remains static in order to preserve the consistency of its feature space. An illustration is shown in Fig. 3.
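A sketch of how the joint objective of Eq. 3 and Eq. 4 could be combined in one training step, with only the trainable parameters updated; the module names and stand-in networks below are placeholders assumed for illustration, not the authors' code.

```python
import torch
import torch.nn.functional as F

def training_step(content_unet, motion_unet, zt, mt, eps1, eps2, t, text_emb):
    """Joint denoising loss L_con + L_mot: each stream predicts its own noise,
    conditioned on the text embedding and on the other stream's latent."""
    eps1_hat = content_unet(zt, t, text_emb, cond=mt)   # personalized content stream
    eps2_hat = motion_unet(mt, t, text_emb, cond=zt)    # personalized motion stream
    return F.mse_loss(eps1_hat, eps1) + F.mse_loss(eps2_hat, eps2)

def trainable_parameters(model):
    # Only the content increment unit and the cross-attention layers keep
    # requires_grad=True; the frozen text-to-image backbone is excluded.
    return [p for p in model.parameters() if p.requires_grad]

# Toy stand-ins so the sketch runs end-to-end (real U-Nets would go here).
content_unet = lambda z, t, txt, cond: z * 0
motion_unet = lambda m, t, txt, cond: m * 0
zt = torch.randn(2, 16, 4, 32, 32); mt = torch.randn_like(zt)
eps1 = torch.randn_like(zt); eps2 = torch.randn_like(zt)
loss = training_step(content_unet, motion_unet, zt, mt, eps1, eps2,
                     t=torch.randint(0, 1000, (2,)), text_emb=None)
```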
90
+
91
+ **Motion Decomposition and Combination.** To separate motion features and reduce computational cost, we design a lightweight motion decomposer and a corresponding motion combiner inspired by [1]. For the motion decomposer, given the input content features \(z_{0}^{1:L}\in\mathbb{R}^{B\times L\times C\times H\times W}\), a \(1\times 1\) convolution layer first reduces the channel dimension (\(C\)) by a factor of \(r\) to alleviate computing expense. We then compute motion features from every pair of sequential frames. For instance, given the transformed \(z_{0}^{l}\) and \(z_{0}^{l+1}\), we first apply a 2D channel-wise convolution to \(z_{0}^{l+1}\) and then subtract \(z_{0}^{l}\) from the result to obtain the motion representation \(\bar{z}_{0}^{l}\) between the \(l\)-th and \((l+1)\)-th frames, formally,
92
+
93
+ \[\bar{z}_{0}^{l}=conv(z_{0}^{l+1})-z_{0}^{l}, \tag{6}\]
94
+
95
+ where \(conv(\cdot)\) denotes a 2D channel-wise convolution. As depicted in Fig. 4(a), we apply the motion decomposer on every two adjacent frames. As a result, the motion decomposer generates \(L-1\) frames of motion features. To ensure compatibility with the original temporal length, we append the last frame of video to reach the length \(L\) in our experiment. Finally, another \(1\times 1\) 2D convolution layer is utilized to restore the number of channels back to \(C\).
96
+
97
+ For the motion combiner, given the denoised content and motion features \(z^{1:L}\) and \(\bar{z}^{1:L}\), we also first employ \(1\times 1\) convolution to reduce the channel number. As shown in Fig. 4(b), the content feature and their adjacent motion features are fused after doing a 2D convolution on motion features. Formally, the \(l\)-th frame latent feature \(\hat{z}_{0}^{l}\) of the generated video is defined as,
98
+
99
+ \[\hat{z}_{0}^{l}=conv(\bar{z}_{0}^{l-1})+z_{0}^{l}+conv(\bar{z}_{0}^{l}), \tag{7}\]
100
+
101
+ where \(conv\) represents the 2D channel-wise convolution. Upon acquiring the combined video latent features, it comes back to the original channel dimension via a \(1\times 1\) convolutional layer. This combined features are then input into the final decoder, which yields a video in the pixel space.
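The decompose/combine operations of Eq. 6 and Eq. 7 reduce to frame differences plus channel-wise convolutions; a compact sketch follows, where the channel count, reduction factor \(r\), the \(3\times 3\) channel-wise kernel, and the boundary handling for the first frame in the combiner are assumptions not specified in the text.

```python
import torch
import torch.nn as nn

class MotionDecomposer(nn.Module):
    """Eq. 6: m^l = conv(z^{l+1}) - z^l, computed on channel-reduced latents."""
    def __init__(self, c: int, r: int = 4):
        super().__init__()
        self.reduce = nn.Conv2d(c, c // r, 1)
        self.dw = nn.Conv2d(c // r, c // r, 3, padding=1, groups=c // r)  # channel-wise conv
        self.expand = nn.Conv2d(c // r, c, 1)

    def forward(self, z):                        # z: (B, L, C, H, W)
        b, l, c, h, w = z.shape
        x = self.reduce(z.flatten(0, 1)).view(b, l, -1, h, w)
        m = self.dw(x[:, 1:].flatten(0, 1)).view(b, l - 1, -1, h, w) - x[:, :-1]
        m = torch.cat([m, m[:, -1:]], dim=1)     # pad back to length L with the last frame
        return self.expand(m.flatten(0, 1)).view(b, l, c, h, w)

class MotionCombiner(nn.Module):
    """Eq. 7: zhat^l = conv(m^{l-1}) + z^l + conv(m^l)."""
    def __init__(self, c: int, r: int = 4):
        super().__init__()
        self.reduce_z = nn.Conv2d(c, c // r, 1)
        self.reduce_m = nn.Conv2d(c, c // r, 1)
        self.dw = nn.Conv2d(c // r, c // r, 3, padding=1, groups=c // r)
        self.expand = nn.Conv2d(c // r, c, 1)

    def forward(self, z, m):
        b, l, c, h, w = z.shape
        zr = self.reduce_z(z.flatten(0, 1)).view(b, l, -1, h, w)
        mr = self.dw(self.reduce_m(m.flatten(0, 1))).view(b, l, -1, h, w)
        prev = torch.cat([mr[:, :1], mr[:, :-1]], dim=1)   # m^{l-1}, repeating the first frame
        out = prev + zr + mr
        return self.expand(out.flatten(0, 1)).view(b, l, c, h, w)

dec, comb = MotionDecomposer(4), MotionCombiner(4)
z = torch.randn(1, 16, 4, 32, 32)
m = dec(z)
zhat = comb(z, m)
```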
102
+
103
+ ## Experiments
104
+
105
+ ### Implementation Details
106
+
107
+ In our experimental setup, we generate \(L=16\) frames with a resolution \(512\) for each video. We trained the Dual-Stream Diffusion Net using a subset (comprising 5M videos) from the WebVid-10M [1] and HD-VILA-100M [20] datasets. Video clips within the dataset are first sampled at a stride of 4, then resized and centrally cropped to a resolution of \(256\times 256\). For the content basic unit, we employ stable diffusion v1.5 pre-trained weights which remain frozen throughout the training procedure. The content incremental unit does not commence training from scratch, but rather utilizes an existing model [15] as pre-trained weights, followed by fine-tuning on our training data. For the motion unit, we initialize our Personalized Motion Generation Stream with the weights of LDM [15], which were pre-trained on Laion-5B [15]. During inference, it takes approximately 35 seconds to sample a single video using one NVIDIA RTX 4090 GPU.
108
+
109
+ Figure 4: Details of Motion Decomposer and Motion Combiner.
110
+
111
+ Figure 3: Dual-stream transformation block.
112
+
113
+ ### Comparison with Baselines
114
+
115
+ We compare our method with two publicly available baselines: 1) CogVideo [19], a text-to-video model trained on a dataset of 5.4 million captioned videos that can generate videos directly from text prompts in a zero-shot manner; and 2) Text2Video-Zero [17], also a text-to-video generation method based on the Stable Diffusion model. Since our method is text-to-video, we compare with Text2Video-Zero in pure text-guided video synthesis settings. Owing to space constraints, we present a quantitative comparison with both approaches, but for qualitative results we limit the analysis to the better-performing Text2Video-Zero for a more focused evaluation.
116
+
117
+ **Quantitative Comparison.** We evaluate our method against the baseline models with automated metrics, reporting frame consistency and textual alignment in Table 1. For frame consistency, we compute CLIP [13] image embeddings for all frames of the output videos and report the average cosine similarity over all pairs of video frames. For textual alignment, we compute the average CLIP score between all frames of the output videos and their corresponding prompts. The videos generated by our method surpass publicly available alternatives such as CogVideo [19] and Text2Video-Zero [17] in both frame consistency and text alignment, suggesting that our method offers a more robust and coherent approach to video generation from textual prompts.
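These two automatic metrics can be approximated with off-the-shelf CLIP embeddings; a rough sketch follows, where the CLIP checkpoint, frame extraction, and the scaling by 100 to match the range of Table 1 are assumptions.

```python
import itertools
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

@torch.no_grad()
def clip_scores(frames, prompt):
    """frames: list of PIL images extracted from one generated video."""
    inputs = proc(text=[prompt], images=frames, return_tensors="pt", padding=True)
    img = model.get_image_features(pixel_values=inputs["pixel_values"])
    txt = model.get_text_features(input_ids=inputs["input_ids"],
                                  attention_mask=inputs["attention_mask"])
    img = img / img.norm(dim=-1, keepdim=True)
    txt = txt / txt.norm(dim=-1, keepdim=True)

    # Frame consistency: mean cosine similarity over all frame pairs.
    pairs = list(itertools.combinations(range(len(frames)), 2))
    consistency = torch.stack([img[i] @ img[j] for i, j in pairs]).mean()

    # Textual alignment: mean CLIP score between each frame and the prompt.
    alignment = (img @ txt.T).mean()
    return consistency.item() * 100, alignment.item() * 100
```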
118
+
119
+ Qualitative ComparisonWhen compared to text2video-zero, our method demonstrates superior consistency in both content and motion across generated videos, as shown in Fig. 5. As illustrated in the first row, observable body swings are evident as the panda walks, while in the second row, we see the limbs swing as the person runs, accompanied by gradual changes in the background. In the third row, the lower limbs of the horse are seen swinging as it gallops, set against a dynamic background. Furthermore, our method outperforms the other approach in terms of content quality and its conformity with the text. For instance, the generated pandas in our model appear more realistically rendered, the snow in the second row exhibits imprints, and the street in the third row is more logically constructed. We further include a fourth row as a comparative example in a less favourable environment - rain conditions. Here, the content generated by the other method appears unrealistic, whereas our method not only captures the essence of a rainy day more effectively but also establishes the logical connection between rain and umbrellas, thereby enhancing the realism and context-appropriateness of generated videos.
120
+
121
+ Beyond this, we further show the qualitative results of our method on video diversity generation, as shown in Fig. 6. Using "a cat is walking on the grass" as the text input, we can see various actions such as a cat walking forward, left, and right. Impressively, the generated videos also exhibit diversity in aspects like fur color, body shape, and pose, thereby encompassing a rich variety in content. Concurrently, the generated video preserves high continuity. As demonstrated in the first row, the flower at the lower right gradually comes into view, while in the second row, subtle changes in the cat's shadow can be discerned. In the third row, a figure in the background is progressively moving, further enhancing the sense of dynamic realism. Furthermore, we acknowledge a minor failure case depicted at the end of the third row, where the color of the cat appears slightly altered. This issue primarily stems from the generated background inadvertently influencing the content creation process itself. However, as evident from the comprehensive results showcased, such instances are rare, thereby attesting to the robustness and reliability of our proposed method in text-to-video generation.
122
+
123
+ \begin{table}
124
+ \begin{tabular}{l c c} \hline \hline Methods & Frame Consistency & Textual Alignment \\ \hline CogVideo & 88.32 & 22.02 \\ Text2Video-Zero & 90.21 & 29.56 \\ Ours & **92.13** & **32.23** \\ \hline \hline \end{tabular}
125
+ \end{table}
126
+ Table 1: Comparison of CLIP score metric with baselines.
127
+
128
+ Figure 5: Qualitative comparison between Text2Video-Zero [17] (frames 1-4 in each row) and our method (frames 5-8 in each row). Please see the videos in the website.
129
+
130
+ Figure 6: The diversity of our method qualitative results. Prompt: a cat is walking on the grass.
131
+
132
+ ### Ablation Study
133
+
134
+ We conduct a rigorous ablation study to evaluate the significance of both the content increment unit and the motion unit, as depicted in Fig 7. Each design component is selectively ablated to determine its individual impact on the model's overall performance.
135
+
136
+ #### Motion Unit
137
+
138
+ The outcomes from the first row indicate that while a model void of the motion unit can still synthesize apparent content in accordance with text conditions, it fails to maintain the continuity between video frames. This result stems from the absence of temporal dimension modeling--resulting in generated video frames being independently constrained by text conditions without inter-frame connection. In terms of content congruity, the generated video frames exhibit a solid alignment with the narrative conveyed by the textual conditions. For instance, elements like 'dancing', 'leaves', and 'curly hair' described in the text are accurately manifested within the generated imagery.
139
+
140
+ #### Incremental Unit
141
+
142
+ As observed in the second row, the visible content quality suffers a significant reduction without the incremental unit model. This underscores the pivotal role of the content increment unit in learning richer visual content beyond what the content base unit alone can achieve. Upon analyzing the results from the first three rows, we observe a few issues: 1) Fine-tuning the incremental unit seems to stabilize the apparent content; for instance, the girls in both the first and third rows face forward, whereas without the incremental unit, as seen in the second row, the girl's perspective can emerge from any direction. 2)The clothing color in the first and third rows leans towards green, mirroring the hue of the background environment. These challenges might arise due to limited parameter volume within the incremental unit, thereby restricting the scope of apparent content it can learn effectively. Such observations underscore areas for further exploration and improvement in the incremental unit of our method.
143
+
144
+ #### Motion Unit Visualization
145
+
146
+ Furthermore, we offer a detailed visualization of the motion unit's effects of the third row in the last row. The visualizations highlight the efficacy of the motion unit in accurately capturing inter-frame motion details such as arm swings, body movements, and hair fluttering, thereby underscoring its critical role in achieving a coherent and dynamic video output.
147
+
148
+ ## Conclusion
149
+
150
+ This work presented a novel dual-stream diffusion net (DSDN) to improve the consistency of content variations in generating videos. Specifically, the designed two diffusion streams, video content and motion branches, could not only run separately in their private spaces for producing personalized video variations as well as content, but also be well-aligned between the content and motion domains through leveraging our designed cross-transformer interaction module, which would benefit the smoothness of generated videos and enhance the consistency and diversity of generated frames, where the motion is specifically modeled as a single branch that distinguishes from most existing video diffusion methods. Besides, we also introduced motion decomposer and combiner to facilitate the operation on video motion. Qualitative and quantitative experiments demonstrated that our method produces better continuous videos with fewer flickers.
151
+
152
+ Figure 7: Ablation study. Prompt: a girl is dancing among leaves, curly hair.
153
+
154
+ ## References
155
+
156
+ * M. Bain, A. Nagrani, G. Varol, and A. Zisserman (2021)Frozen in time: a joint video and image encoder for end-to-end retrieval. Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV). Cited by: SS1, SS2.
157
+ * A. Blattmann, R. Rombach, H. Ling, T. Dockhorn, S. W. Kim, S. Fidler, and K. Kreis (2023)Align your latents: high-resolution video synthesis with latent diffusion models. Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22563-22575. Cited by: SS1, SS2.
158
+ * A. Civitai (2022)Civitai. [https://civitai.com/](https://civitai.com/). Cited by: SS1, SS2.
159
+ * A. Clark, J. Donahue, and K. Simonyan (2019)Adversarial video generation on complex datasets. arXiv preprint arXiv:1907.06571. Cited by: SS1, SS2.
160
+ * I. Goodfellow, J. Pouget-Abadie, M. Mirza, B. Xu, D. Warde-Farley, S. Ozair, A. Courville, and Y. Bengio (2020)Generative adversarial networks. Communications of the ACM. Cited by: SS1, SS2.
161
+ * Y. Guo, C. Yang, A. Rao, Y. Wang, Y. Qiao, D. Lin, and B. Dai (2023)AnimateDiff: animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725. Cited by: SS1, SS2.
162
+ * W. Harvey, S. Naderiparizi, V. Masrani, C. Weilbach, and F. Wood (2022)Flexible diffusion modeling of long videos. arXiv preprint arXiv:2205.11495. Cited by: SS1, SS2.
163
+ * J. Ho, A. Jain, and P. Abbeel (2020)Denoising diffusion probabilistic models. Neural Information Processing Systems (NeurIPS). Cited by: SS1, SS2.
164
+ * J. Ho, T. Salimans, A. Gritsenko, W. Chan, M. Norouzi, and D. J. Fleet (2022)Video diffusion models. arXiv preprint arXiv:2204.03458. Cited by: SS1, SS2.
165
+ * W. Hong, M. Ding, W. Zheng, X. Liu, and J. Tang (2022)CogVideo: large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868. Cited by: SS1, SS2.
166
+ * E. J. Hu, Y. Shen, P. Wallis, Z. Allen-Zhu, Y. Li, S. Wang, L. Wang, and W. Chen (2021)LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685. Cited by: SS1, SS2.
167
+ * B. Jiang, M. Wang, W. Gan, W. Wu, and J. Yan (2019)STM: spatio-temporal and motion encoding for action recognition. Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp. 2000-2009. Cited by: SS1, SS2.
168
+ * N. Kalchbrenner, A. Oord, K. Simonyan, I. Danihelka, O. Vinyals, A. Graves, and K. Kavukcuoglu (2017)Video pixel networks. In Proceedings of the 34th International Conference on Machine Learning, pp. 1771-1779. Cited by: SS1, SS2.
169
+ * L. Khachatryan, A. Movsisyan, V. Tadevosyan, R. Henschel, Z. Wang, S. Navasardyan, and H. Shi (2023)Text2Video-zero: text-to-image diffusion models are zero-shot video generators. Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV). Cited by: SS1, SS2.
170
+ * H. Ni, C. Shi, K. Li, S. X. Huang, and M. R. Min (2023)Conditional image-to-video generation with latent flow diffusion models. Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18444-18455. Cited by: SS1, SS2.
171
+ * A. Nichol, P. Dhariwal, A. Ramesh, P. Shyam, P. Mishkin, B. McGrew, I. Sutskever, and M. Chen (2021)Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741. Cited by: SS1, SS2.
172
+ * A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, and S. Agarwal (2021)Learning transferable visual models from natural language supervision. Proceedings of the 38th International Conference on Machine Learning and PMLR8748-8763. Cited by: SS1, SS2.
173
+ * C. Raffel, N. Shazeer, A. Roberts, K. Lee, and S. Narang (2020)Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research5485-5551. Cited by: SS1, SS2.
174
+ * A. Ramesh, P. Dhariwal, A. Nichol, C. Chu, and M. Chen (2022) Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125. Cited by: SS1, SS2.
175
+ * R. Rombach, A. Blattmann, D. Lorenz, and B. Esser (2022)High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684-10695. Cited by: SS1, SS2.
176
+ * C. Saharia, W. Chan, S. Saxena, L. Li, J. Whang, E. L. Denton, and K. Ghasemipour (2022)Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems36479-36494. Cited by: SS1, SS2.
177
+ * C. Schuhmann, R. Vencu, R. Beaumont, T. Coombes, C. Gordon, A. Katta, R. Kaczmarczyk, and J. Jitsev (2022)LAION-5B: a new era of open large-scale multimodal datasets. arXiv preprint arXiv:2307.04725. Cited by: SS1, SS2.
178
+ * U. Singer, A. Polyak, T. Hayes, X. Yin, J. An, S. Zhang, Q. Hu, H. Yang, O. Ashual, and O. Gafni (2022) Make-A-Video: text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792. Cited by: SS1, SS2.
179
+ * L. Smaira, J. Carreira, E. Noland, E. Clancy, A. Wu, and A. Zisserman (2020)A short note on the kinetics-700-2020 human action dataset. arXiv:2010.10864. Cited by: SS1, SS2.
180
+ * J. Song and S. Meng (2022)Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502. Cited by: SS1, SS2.
181
+ * A. van den Oord, O. Vinyals, and K. Kavukcuoglu (2018)Neural discrete representation learning. Advances in Neural Information Processing Systems. Cited by: SS1, SS2.
182
+ * C. Vondrick, H. Pirsiavash, and A. Torralba (2016)Generating videos with scene dynamics. arXiv preprint arXiv:1609.02612. Cited by: SS1, SS2.
183
+ * H. Xue, T. Hang, Y. Zeng, Y. Sun, B. Liu, H. Yang, J. Fu, and B. Guo (2022) Advancing high-resolution video-language representation with large-scale video transcriptions. Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5036-5045. Cited by: SS1, SS2.
184
+ * S. Yu, K. Sohn, S. Kim, and J. Shin (2023)Video probabilistic diffusion models in projected latent space. Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18456-18466. Cited by: SS1, SS2.