THUDM/CogVideoX1.5-5B

@@ -127,14 +127,14 @@ pip install --upgrade transformers accelerate diffusers imageio-ffmpeg
 2. Run the code
-```python
 import torch
-from diffusers import CogVideoXImageToVideoPipeline
-from diffusers.utils import export_to_video, load_image
-prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
-image = load_image(image="input.jpg")
-pipe = CogVideoXImageToVideoPipeline.from_pretrained(
     "THUDM/CogVideoX1.5-5B",
     torch_dtype=torch.bfloat16
 )
@@ -145,7 +145,6 @@ pipe.vae.enable_slicing()
 video = pipe(
     prompt=prompt,
-    image=image,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,
@@ -169,7 +168,7 @@ with `torch.compile`, which can significantly accelerate inference.
 import torch
 from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXImageToVideoPipeline
-from diffusers.utils import export_to_video, load_image
 from transformers import T5EncoderModel
 from torchao.quantization import quantize_, int8_weight_only
@@ -200,10 +199,8 @@ pipe.vae.enable_tiling()
 pipe.vae.enable_slicing()
 prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
-image = load_image(image="input.jpg")
 video = pipe(
     prompt=prompt,
-    image=image,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,

 2. Run the code
+```python
 import torch
+from diffusers import CogVideoXPipeline
+from diffusers.utils import export_to_video
+prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
+pipe = CogVideoXPipeline.from_pretrained(
     "THUDM/CogVideoX1.5-5B",
     torch_dtype=torch.bfloat16
 )
 video = pipe(
     prompt=prompt,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,
 import torch
 from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXImageToVideoPipeline
+from diffusers.utils import export_to_video
 from transformers import T5EncoderModel
 from torchao.quantization import quantize_, int8_weight_only
 pipe.vae.enable_slicing()
 prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
 video = pipe(
     prompt=prompt,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,

README_zh.md CHANGED Viewed

@@ -109,12 +109,12 @@ pip install --upgrade transformers accelerate diffusers imageio-ffmpeg
 ```python
 import torch
-from diffusers import CogVideoXImageToVideoPipeline
-from diffusers.utils import export_to_video, load_image
-prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
-image = load_image(image="input.jpg")
-pipe = CogVideoXImageToVideoPipeline.from_pretrained(
     "THUDM/CogVideoX1.5-5B",
     torch_dtype=torch.bfloat16
 )
@@ -125,7 +125,6 @@ pipe.vae.enable_slicing()
 video = pipe(
     prompt=prompt,
-    image=image,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,
@@ -148,7 +147,7 @@ GPU 上运行该模型成为可能！值得注意的是，TorchAO 量化与 `tor
 import torch
 from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXImageToVideoPipeline
-from diffusers.utils import export_to_video, load_image
 from transformers import T5EncoderModel
 from torchao.quantization import quantize_, int8_weight_only
@@ -177,10 +176,8 @@ pipe.vae.enable_tiling()
 pipe.vae.enable_slicing()
 prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
-image = load_image(image="input.jpg")
 video = pipe(
     prompt=prompt,
-    image=image,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,

 ```python
 import torch
+from diffusers import CogVideoXPipeline
+from diffusers.utils import export_to_video
+prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
+pipe = CogVideoXPipeline.from_pretrained(
     "THUDM/CogVideoX1.5-5B",
     torch_dtype=torch.bfloat16
 )
 video = pipe(
     prompt=prompt,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,
 import torch
 from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXImageToVideoPipeline
+from diffusers.utils import export_to_video
 from transformers import T5EncoderModel
 from torchao.quantization import quantize_, int8_weight_only
 pipe.vae.enable_slicing()
 prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
 video = pipe(
     prompt=prompt,
     num_videos_per_prompt=1,
     num_inference_steps=50,
     num_frames=81,