# Hugging Face Inference Endpoints custom handler for the AnimateDiff-SceneFusion pipeline.

from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from omegaconf import OmegaConf
from huggingface_hub import hf_hub_download, try_to_load_from_cache

import os
import json
import base64

from safetensors import safe_open

from diffusers.utils.import_utils import is_xformers_available
from typing import Any
import torch
import imageio
import torchvision
import numpy as np
from einops import rearrange

from animatediff.models.unet import UNet3DConditionModel
from animatediff.pipelines.pipeline_animation import AnimationPipeline
from animatediff.utils.util import save_videos_grid
from animatediff.utils.util import load_weights
from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora

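# Subfolder under models/Motion_Module on the Hub that holds the motion module and inverted latents to load.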
current_model = "backup"

class EndpointHandler:
    def __init__(self, model_path: str = "bluestarburst/AnimateDiff-SceneFusion"):
        
        # inference_config_path = "configs/inference/inference-v3.yaml"
        inference_config_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="configs/inference/inference-v3.yaml")
        print(inference_config_path)
        
        inference_config = OmegaConf.load(inference_config_path)
        
        # inference_config = {'unet_additional_kwargs': {'unet_use_cross_frame_attention': False, 'unet_use_temporal_attention': False, 'use_motion_module': True, 'motion_module_resolutions': [1, 2, 4, 8], 'motion_module_mid_block': False, 'motion_module_decoder_only': False, 'motion_module_type': 'Vanilla', 'motion_module_kwargs': {'num_attention_heads': 8, 'num_transformer_block': 1, 'attention_block_types': ['Temporal_Self', 'Temporal_Self'], 'temporal_position_encoding': True, 'temporal_position_encoding_max_len': 24, 'temporal_attention_dim_div': 1}}, 'noise_scheduler_kwargs': {'DDIMScheduler': {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'linear', 'steps_offset': 1, 'clip_sample': False}, 'EulerAncestralDiscreteScheduler': {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'linear'}, 'KDPM2AncestralDiscreteScheduler': {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'linear'}}}
        
        ### >>> create validation pipeline >>> ###
        tokenizer    = CLIPTokenizer.from_pretrained(model_path, subfolder="models/StableDiffusion/tokenizer")
        text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="models/StableDiffusion/text_encoder")
        vae          = AutoencoderKL.from_pretrained(model_path, subfolder="models/StableDiffusion/vae")         
        
        unet_model_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/StableDiffusion/unet/diffusion_pytorch_model.bin")
        unet_config_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/StableDiffusion/unet/config.json")        

        print(unet_model_path)

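        # Inflate the 2D Stable Diffusion UNet weights into AnimateDiff's 3D UNet,
        # applying the motion-module settings from the inference config.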
        unet         = UNet3DConditionModel.from_pretrained_2d(pretrained_model_path=unet_model_path, unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs), config_path=unet_config_path)

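        # Load the DDIM-inverted latents that seed generation with a fixed starting noise.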
        # inv_latent_path = f"{OUTPUT_DIR}/inv_latents/ddim_latent-1.pt"
        inv_latent_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/inv_latents/ddim_latent-1.pt")
        self.latents = torch.load(inv_latent_path).to(torch.float)
        print(self.latents.shape, self.latents.dtype)
        
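        # Enable PyTorch's flash and math scaled-dot-product attention backends.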
        # torch.backends.cuda.enable_mem_efficient_sdp(True)
        torch.backends.cuda.enable_flash_sdp(True)
        torch.backends.cuda.enable_math_sdp(True)

        if is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
            raise RuntimeError("xformers is required by this handler but is not installed")

        self.pipeline = AnimationPipeline(
            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet,
            scheduler=DDIMScheduler(**OmegaConf.to_container(inference_config.noise_scheduler_kwargs.DDIMScheduler))
        ).to("cuda")
        
        # huggingface download motion module from bluestarburst/AnimateDiff-SceneFusion/models/Motion_Module/mm_sd_v15.ckpt

        # motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/Motion_Module/mm_sd_v15.ckpt")
        motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/mm.pth")
        # LORA_DREAMBOOTH_PATH="models/DreamBooth_LoRA/toonyou_beta3.safetensors"
        
        LORA_DREAMBOOTH_PATH = None
        LORA_DREAMBOOTH_PATH = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/DreamBooth_LoRA/toonyou_beta3.safetensors")

        # self.pipeline = load_weights(
        #     self.pipeline,
        #     # motion module
        #     motion_module_path         = motion_module,
        #     motion_module_lora_configs = [],
        #     # image layers
        #     dreambooth_model_path      = "",
        #     lora_model_path            = "",
        #     lora_alpha                 = 0.8,
        # ).to("cuda")
        
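        # Load the fine-tuned motion-module weights into the UNet; strict=False because the
        # checkpoint only contains the temporal (motion) layers, and the assert below checks
        # that every key in the checkpoint was actually consumed.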
        motion_module_state_dict = torch.load(motion_module, map_location="cpu")
        missing, unexpected = self.pipeline.unet.load_state_dict(motion_module_state_dict, strict=False)
        assert len(unexpected) == 0
        
        
        # Merge the DreamBooth / LoRA checkpoint resolved above into the pipeline, if any.
        if LORA_DREAMBOOTH_PATH:
            if LORA_DREAMBOOTH_PATH.endswith(".ckpt"):
                state_dict = torch.load(LORA_DREAMBOOTH_PATH)
                self.pipeline.unet.load_state_dict(state_dict)

            elif LORA_DREAMBOOTH_PATH.endswith(".safetensors"):
                state_dict = {}
                with safe_open(LORA_DREAMBOOTH_PATH, framework="pt", device="cpu") as f:
                    for key in f.keys():
                        state_dict[key] = f.get_tensor(key)

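                # A pure-LoRA checkpoint only contains "lora" keys; otherwise the file is
                # treated as a full DreamBooth checkpoint carrying base VAE/UNet weights.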
                is_lora = all("lora" in k for k in state_dict.keys())
                if not is_lora:
                    base_state_dict = state_dict
                else:
                    base_state_dict = {}
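                    # NOTE: the base checkpoint path below is left empty in this file; it must
                    # point at a full Stable Diffusion .safetensors checkpoint for this LoRA
                    # branch to work.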
                    with safe_open("", framework="pt", device="cpu") as f:
                        for key in f.keys():
                            base_state_dict[key] = f.get_tensor(key)

                # vae
                converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_state_dict, self.pipeline.vae.config)
                self.pipeline.vae.load_state_dict(converted_vae_checkpoint)
                # unet
                converted_unet_checkpoint = convert_ldm_unet_checkpoint(base_state_dict, self.pipeline.unet.config)
                self.pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)
                # text_model (TODO: problem here)
                # converted_test_encoder_checkpoint = convert_ldm_clip_checkpoint(base_state_dict)
                # pipeline.text_encoder = converted_test_encoder_checkpoint

                # import pdb
                # pdb.set_trace()
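                # Merge the LoRA weight deltas into the pipeline's UNet and text encoder.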
                if is_lora:
                    self.pipeline = convert_lora(self.pipeline, state_dict)
                    # self.pipeline = convert_lora(self.pipeline, state_dict, alpha=model_config.lora_alpha)

        self.pipeline.to("cuda")
    
    def __call__(self, data: Any):
        """
        Called once per request. Reads optional "prompt", "negative_prompt",
        "steps", and "guidance_scale" keys from `data`, runs the animation
        pipeline, and returns a JSON string containing a Base64-encoded GIF.
        """
        
        prompt = data.pop("prompt", "")
        negative_prompt = data.pop("negative_prompt", "")
        negative_prompt += ",easynegative,bad_construction,bad_structure,bad_wail,bad_windows,blurry,cloned_window,cropped,deformed,disfigured,error,extra_windows,extra_chimney,extra_door,extra_structure,extra_frame,fewer_digits,fused_structure,gross_proportions,jpeg_artifacts,long_roof,low_quality,structure_limbs,missing_windows,missing_doors,missing_roofs,mutated_structure,mutation,normal_quality,out_of_frame,owres,poorly_drawn_structure,poorly_drawn_house,signature,text,too_many_windows,ugly,username,uta,watermark,worst_quality"
        steps = data.pop("steps", 25)
        guidance_scale = data.pop("guidance_scale", 12.5)
        
        print(f"current seed: {torch.initial_seed()}")
        print(f"sampling {prompt} ...")
        vids = self.pipeline(
            prompt,
            negative_prompt     = negative_prompt,
            num_inference_steps = steps,
            guidance_scale      = guidance_scale,
            width               = 256,
            height              = 256,
            video_length        = 5,
            latents             = self.latents,
        ).videos

        # vids = self.pipeline(
        #     prompt=prompt, 
        #     negative_prompt=negative_prompt, 
        #     num_inference_steps=steps, 
        #     guidance_scale=guidance_scale,
        #     width= 256,
        #     height= 256,
        #     video_length= 5,
        #     ).videos
        
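        # Convert the (batch, channel, time, height, width) tensor into per-frame image
        # grids and encode them as a GIF.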
        videos = rearrange(vids, "b c t h w -> t b c h w")
        n_rows = 6
        fps = 1
        loop = True
        rescale = False
        outputs = []
        for x in videos:
            x = torchvision.utils.make_grid(x, nrow=n_rows)
            x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
            if rescale:
                x = (x + 1.0) / 2.0  # -1,1 -> 0,1
            x = (x * 255).numpy().astype(np.uint8)
            outputs.append(x)
            
        path = "output.gif"
        imageio.mimsave(path, outputs, fps=fps)
        
        # open the file as binary and read the data
        with open(path, mode="rb") as file:
            file_content = file.read()
        # Encode the binary data as Base64 so it can be embedded in a JSON response.
        base64_encoded_content = base64.b64encode(file_content).decode("utf-8")

        # Create a JSON object with the Base64-encoded content
        json_data = {
            "filename": "output.gif",
            "content": base64_encoded_content
        }

        # Convert the JSON object to a JSON-formatted string
        return json.dumps(json_data)
    

# This class is the entry point for the serverless endpoint: it is instantiated
# once at startup and called for every inference request.


# new_handler = EndpointHandler()
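
# A minimal local usage sketch (an assumption, not part of the deployed endpoint:
# it presumes a CUDA GPU and that every checkpoint referenced above is reachable
# on the Hugging Face Hub; the prompt below is only a placeholder):
#
# handler = EndpointHandler()
# response = handler({
#     "prompt": "a watercolor house on a hill",
#     "negative_prompt": "low quality",
#     "steps": 25,
#     "guidance_scale": 12.5,
# })
# payload = json.loads(response)
# with open(payload["filename"], "wb") as out_file:
#     out_file.write(base64.b64decode(payload["content"]))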