Spaces:
Sleeping
Sleeping
## Requirements
Please follow the [README](../README.md) to install the environment. After installation, upgrade `diffusers` to version 0.30.0 or later.
## Inference
```python
from diffusers import LattePipeline
from diffusers.models import AutoencoderKLTemporalDecoder
from torchvision.utils import save_image
import torch
import imageio

# Sample a single image or a short video from a text prompt with Latte-1.
torch.manual_seed(0)  # fixed seed so repeated runs produce the same sample

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 kernels are not universally available on CPU, so only use half
# precision when the model actually runs on a GPU.
dtype = torch.float16 if device == "cuda" else torch.float32
video_length = 1  # 1 or 16

pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=dtype).to(device)

# if you want to use the temporal decoder of VAE, please uncomment the following codes
# vae = AutoencoderKLTemporalDecoder.from_pretrained("maxin-cn/Latte-1", subfolder="vae_temporal_decoder", torch_dtype=dtype).to(device)
# pipe.vae = vae

prompt = "a cat wearing sunglasses and working as a lifeguard at pool."
videos = pipe(prompt, video_length=video_length, output_type='pt').frames.cpu()

# Multiple frames are written as an MP4; a single frame is written as a PNG.
if video_length > 1:
    videos = (videos.clamp(0, 1) * 255).to(dtype=torch.uint8)  # convert to uint8
    imageio.mimwrite('./latte_output.mp4', videos[0].permute(0, 2, 3, 1), fps=8, quality=5)  # highest quality is 10, lowest is 0
else:
    save_image(videos[0], './latte_output.png')
```
## Inference with 4/8-bit quantization
[@Aryan](https://github.com/a-r-r-o-w) provides a quantization solution for inference that reduces GPU memory usage from 17 GB to 9 GB. It requires `bitsandbytes`, which can be installed with `pip install bitsandbytes`.
```python
import gc

import imageio
import torch
from diffusers import LattePipeline
from torchvision.utils import save_image
from transformers import T5EncoderModel, BitsAndBytesConfig

torch.manual_seed(0)  # fixed seed so repeated runs produce the same sample

def flush():
    """Free unreferenced Python objects and PyTorch's cached CUDA memory.

    Garbage collection runs first so that tensors which are no longer
    referenced are released before the CUDA caching allocator is emptied.
    """
    gc.collect()
    torch.cuda.empty_cache()

def bytes_to_giga_bytes(bytes):
    """Convert a byte count to gigabytes using binary units (1024**3 bytes).

    The parameter name shadows the ``bytes`` builtin; it is kept unchanged so
    that existing keyword callers keep working.
    """
    return bytes / 1024 / 1024 / 1024

video_length = 16
# Hub repo id: no trailing slash, otherwise it is rejected as neither a valid
# repo id nor an existing local directory.
model_id = "maxin-cn/Latte-1"

# Stage 1: load only the text encoder, quantized to 4-bit, and encode the prompt.
text_encoder = T5EncoderModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
    device_map="auto",
)
# The transformer is not needed for prompt encoding, so it is left out here.
pipe = LattePipeline.from_pretrained(
    model_id,
    text_encoder=text_encoder,
    transformer=None,
    device_map="balanced",
)

with torch.no_grad():
    prompt = "a cat wearing sunglasses and working as a lifeguard at pool."
    negative_prompt = ""
    prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(prompt, negative_prompt=negative_prompt)

# Drop the text encoder and the first pipeline before loading the second one,
# so both are never resident in GPU memory at the same time.
del text_encoder
del pipe
flush()

# Stage 2: reload the pipeline without a text encoder and sample from the
# precomputed prompt embeddings.
pipe = LattePipeline.from_pretrained(
    model_id,
    text_encoder=None,
    torch_dtype=torch.float16,
).to("cuda")
# pipe.enable_vae_tiling()
# pipe.enable_vae_slicing()

videos = pipe(
    video_length=video_length,
    num_inference_steps=50,
    negative_prompt=None,
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    output_type="pt",
).frames.cpu()

print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")

# Multiple frames are written as an MP4; a single frame is written as a PNG.
if video_length > 1:
    videos = (videos.clamp(0, 1) * 255).to(dtype=torch.uint8)  # convert to uint8
    imageio.mimwrite('./latte_output.mp4', videos[0].permute(0, 2, 3, 1), fps=8, quality=5)  # highest quality is 10, lowest is 0
else:
    save_image(videos[0], './latte_output.png')
```