Haoxin Chen committed on
Commit 4949f04
1 Parent(s): d98ccdc

update videocrafter2

Files changed (3):
  1. app.py +2 -37
  2. configs/inference_t2v_512_v2.0.yaml +77 -0
  3. t2v_test.py +8 -8
app.py CHANGED
@@ -3,7 +3,6 @@ import sys
 import gradio as gr
 # from demo_test import Text2Video, Image2Video
 from t2v_test import Text2Video
-from i2v_test import Image2Video
 sys.path.insert(1, os.path.join(sys.path[0], 'lvdm'))
 
 t2v_examples = [
@@ -15,18 +14,14 @@ t2v_examples = [
     ['Robot dancing in times square',25,12,1,16],
 ]
 
-i2v_examples = [
-    ['prompts/i2v_prompts/horse.png', 'horses are walking on the grassland', 50, 12, 1, 16]
-]
 
 def videocrafter_demo(result_dir='./tmp/'):
     text2video = Text2Video(result_dir)
-    image2video = Image2Video(result_dir)
     with gr.Blocks(analytics_enabled=False) as videocrafter_iface:
-        gr.Markdown("<div align='center'> <h2> VideoCrafter1: Open Diffusion Models for High-Quality Video Generation </span> </h2> \
+        gr.Markdown("<div align='center'> <h2> VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models </span> </h2> \
                     <a style='font-size:18px;color: #000000' href='https://github.com/AILab-CVC/VideoCrafter'> Github </div>")
 
-        gr.Markdown("<b> You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href='https://huggingface.co/spaces/VideoCrafter/VideoCrafter?duplicate=true'> <img src='https://bit.ly/3gLdBN6' alt='Duplicate Space'></a> </b>")
+        # gr.Markdown("<b> You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href='https://huggingface.co/spaces/VideoCrafter/VideoCrafter?duplicate=true'> <img src='https://bit.ly/3gLdBN6' alt='Duplicate Space'></a> </b>")
         #######t2v#######
         with gr.Tab(label="Text2Video"):
             with gr.Column():
@@ -54,36 +49,6 @@ def videocrafter_demo(result_dir='./tmp/'):
                              inputs=[input_text,steps,cfg_scale,eta,fps],
                              outputs=[output_video_1],
                              )
-        #######image2video######
-        with gr.Tab(label='Image2Video'):
-            with gr.Column():
-                with gr.Row():
-                    with gr.Column():
-                        with gr.Row():
-                            i2v_input_image = gr.Image(label="Input Image").style(width=256)
-                        with gr.Row():
-                            i2v_input_text = gr.Text(label='Prompts')
-                        with gr.Row():
-                            i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
-                            i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=12.0, elem_id="i2v_cfg_scale")
-                        with gr.Row():
-                            i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
-                            i2v_fps = gr.Slider(minimum=4, maximum=32, step=1, elem_id="i2v_fps", label="Generative fps", value=16)
-                        i2v_end_btn = gr.Button("Send")
-                    with gr.Tab(label='Result'):
-                        with gr.Row():
-                            i2v_output_video = gr.Video(label="Generated Video").style(width=512)
-
-            gr.Examples(examples=i2v_examples,
-                        inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_fps],
-                        outputs=[i2v_output_video],
-                        fn = image2video.get_image,
-                        cache_examples=os.getenv('SYSTEM') == 'spaces',
-                        )
-        i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_fps],
-                          outputs=[i2v_output_video],
-                          fn = image2video.get_image
-                          )
 
     return videocrafter_iface
 
 
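After this commit the Space serves text-to-video only. Below is a minimal sketch of the surviving layout: `get_prompt_stub` is a hypothetical stand-in for `Text2Video.get_prompt` (the real handler in t2v_test.py), the slider defaults follow the `t2v_examples` row `[prompt, 25, 12, 1, 16]`, and the slider ranges are borrowed from the removed Image2Video tab since the Text2Video widget definitions sit in unchanged lines not shown in this diff.

```python
import gradio as gr

def get_prompt_stub(prompt, steps, cfg_scale, eta, fps):
    # Hypothetical stand-in for Text2Video.get_prompt, which runs DDIM
    # sampling and returns the path of the rendered mp4.
    return f"./tmp/{prompt.replace(' ', '_')}.mp4"

with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Tab(label="Text2Video"):
        input_text = gr.Text(label="Prompts")
        with gr.Row():
            # Ranges assumed to mirror the removed i2v sliders above.
            steps = gr.Slider(minimum=1, maximum=60, step=1, label="Sampling steps", value=25)
            cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label="CFG Scale", value=12.0)
        with gr.Row():
            eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="ETA", value=1.0)
            fps = gr.Slider(minimum=4, maximum=32, step=1, label="Generative fps", value=16)
        send_btn = gr.Button("Send")
        output_video = gr.Video(label="Generated Video")
    # Wire the button exactly as the kept t2v block does.
    send_btn.click(fn=get_prompt_stub,
                   inputs=[input_text, steps, cfg_scale, eta, fps],
                   outputs=[output_video])

if __name__ == "__main__":
    demo.launch()
```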
configs/inference_t2v_512_v2.0.yaml ADDED
@@ -0,0 +1,77 @@
+model:
+  target: lvdm.models.ddpm3d.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size:
+    - 40
+    - 64
+    channels: 4
+    scale_by_std: false
+    scale_factor: 0.18215
+    use_ema: false
+    uncond_type: empty_seq
+    use_scale: true
+    scale_b: 0.7
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNetModel
+      params:
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions:
+        - 4
+        - 2
+        - 1
+        num_res_blocks: 2
+        channel_mult:
+        - 1
+        - 2
+        - 4
+        - 4
+        num_head_channels: 64
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: true
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: false
+        use_causal_attention: false
+        temporal_length: 16
+        addition_attention: true
+        fps_cond: true
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: penultimate
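Each `target:` entry in this config names a class and each `params:` block its constructor arguments. A sketch of how such configs are typically consumed, assuming the LDM-style `instantiate_from_config` helper (the name and body follow the latent-diffusion convention, not necessarily this repo's exact code) and that the `lvdm` package is on `sys.path`:

```python
import importlib
from omegaconf import OmegaConf

def instantiate_from_config(config):
    # Resolve "pkg.module.Class" from the `target` key, then construct it
    # with the `params` mapping. A sketch of the LDM convention, not
    # necessarily this repo's exact helper.
    module_path, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(**config.get("params", dict()))

config = OmegaConf.load("configs/inference_t2v_512_v2.0.yaml")
model_config = config.pop("model", OmegaConf.create())
# Building the model requires the lvdm package:
# model = instantiate_from_config(model_config)
```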
t2v_test.py CHANGED
@@ -12,8 +12,8 @@ class Text2Video():
         self.result_dir = result_dir
         if not os.path.exists(self.result_dir):
             os.mkdir(self.result_dir)
-        ckpt_path='checkpoints/base_1024_v1/model.ckpt'
-        config_file='configs/inference_t2v_1024_v1.0.yaml'
+        ckpt_path='checkpoints/base_512_v2/model.ckpt'
+        config_file='configs/inference_t2v_512_v2.0.yaml'
         config = OmegaConf.load(config_file)
         model_config = config.pop("model", OmegaConf.create())
         model_config['params']['unet_config']['params']['use_checkpoint']=False
@@ -40,7 +40,7 @@ class Text2Video():
         batch_size=1
         channels = model.model.diffusion_model.in_channels
         frames = model.temporal_length
-        h, w = 576 // 8, 1024 // 8
+        h, w = 320 // 8, 512 // 8
         noise_shape = [batch_size, channels, frames, h, w]
 
         #prompts = batch_size * [""]
@@ -61,15 +61,15 @@ class Text2Video():
         return os.path.join(self.result_dir, f"{prompt_str}.mp4")
 
     def download_model(self):
-        REPO_ID = 'VideoCrafter/Text2Video-1024'
+        REPO_ID = 'VideoCrafter/VideoCrafter2'
         filename_list = ['model.ckpt']
-        if not os.path.exists('./checkpoints/base_1024_v1/'):
-            os.makedirs('./checkpoints/base_1024_v1/')
+        if not os.path.exists('./checkpoints/base_512_v2/'):
+            os.makedirs('./checkpoints/base_512_v2/')
         for filename in filename_list:
-            local_file = os.path.join('./checkpoints/base_1024_v1/', filename)
+            local_file = os.path.join('./checkpoints/base_512_v2/', filename)
 
             if not os.path.exists(local_file):
-                hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/base_1024_v1/', local_dir_use_symlinks=False)
+                hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/base_512_v2/', local_dir_use_symlinks=False)
 
 
 if __name__ == '__main__':
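The switched resolution lines up with the new YAML: the KL autoencoder downsamples pixels by 8x, so 320x512 frames become 40x64 latents, exactly the `image_size: [40, 64]` declared above. A quick sanity check using only values from this diff and the config:

```python
# Latent noise shape implied by the commit, cross-checked against the YAML.
batch_size = 1
channels = 4                 # unet_config.params.in_channels
frames = 16                  # unet_config.params.temporal_length
h, w = 320 // 8, 512 // 8    # VAE downsamples 320x512 pixels by 8x -> 40x64 latents
noise_shape = [batch_size, channels, frames, h, w]
assert noise_shape == [1, 4, 16, 40, 64]  # matches image_size: [40, 64]
print(noise_shape)
```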