DeepBeepMeep committed
Commit: ccdbe4e
Parent(s): 5796089

more fixes
Files changed:
- hyvideo/text_encoder/__init__.py +2 -2
- ltx_video/ltxv.py +11 -5
- ltx_video/pipelines/crf_compressor.py +1 -1
- requirements.txt +1 -0
- wan/text2video.py +3 -3
- wgp.py +8 -6
hyvideo/text_encoder/__init__.py
CHANGED
@@ -189,9 +189,9 @@ class TextEncoder(nn.Module):
         if "llm" in text_encoder_type:
             from mmgp import offload
             forcedConfigPath= None if "i2v" in text_encoder_type else "ckpts/llava-llama-3-8b/config.json"
-            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath
+            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath)
             if forcedConfigPath != None:
-                self.model.final_layer_norm = self.model.norm
+                self.model.final_layer_norm = self.model.model.norm
 
         else:
             self.model, self.model_path = load_text_encoder(
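The second change reaches one level deeper for the final norm. A minimal sketch of why, assuming the loader returns an HF-style causal-LM wrapper whose decoder stack lives in a .model attribute (class names below are illustrative, not the repository's):

# Illustrative only: mirrors the wrapper layout this fix assumes.
class LlamaDecoder:                  # inner decoder stack
    def __init__(self):
        self.norm = "final RMSNorm"

class LlamaForCausalLM:              # outer wrapper returned by the loader
    def __init__(self):
        self.model = LlamaDecoder()

llm = LlamaForCausalLM()
final_layer_norm = llm.model.norm    # llm.norm would raise AttributeError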
ltx_video/ltxv.py
CHANGED
@@ -155,14 +155,14 @@ class LTXV:
     ):
 
         self.mixed_precision_transformer = mixed_precision_transformer
-
+        self.distilled = "distilled" in model_filepath[0]
         # with safe_open(ckpt_path, framework="pt") as f:
         #     metadata = f.metadata()
         #     config_str = metadata.get("config")
         #     configs = json.loads(config_str)
         #     allowed_inference_steps = configs.get("allowed_inference_steps", None)
         # transformer = Transformer3DModel.from_pretrained(ckpt_path)
-        # offload.
+        # transformer = offload.fast_load_transformers_model("c:/temp/ltxdistilled/diffusion_pytorch_model-00001-of-00006.safetensors", forcedConfigPath="c:/temp/ltxdistilled/config.json")
 
         # vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
         vae = offload.fast_load_transformers_model("ckpts/ltxv_0.9.7_VAE.safetensors", modelClass=CausalVideoAutoencoder)

@@ -174,8 +174,11 @@ class LTXV:
         # vae = offload.fast_load_transformers_model("vae.safetensors", modelClass=CausalVideoAutoencoder, modelPrefix= "vae", forcedConfigPath="config_vae.json")
         # offload.save_model(vae, "vae.safetensors", config_file_path="config_vae.json")
 
-
-        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
+        # model_filepath = "c:/temp/ltxd/ltxv-13b-0.9.7-distilled.safetensors"
+        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel, forcedConfigPath= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", config_file_path= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/ltxd/config.json")
+        # transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
         transformer._model_dtype = dtype
         if mixed_precision_transformer:
             transformer._lock_dtype = torch.float

@@ -295,7 +298,10 @@ class LTXV:
         conditioning_media_paths = None
         conditioning_start_frames = None
 
-
+        if self.distilled :
+            pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml"
+        else:
+            pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
         # check if pipeline_config is a file
         if not os.path.isfile(pipeline_config):
             raise ValueError(f"Pipeline config file {pipeline_config} does not exist")
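Taken together, these hunks wire a filename-based variant switch: the constructor records whether the loaded checkpoint is a distilled one, and generation later picks the matching pipeline YAML. A minimal standalone sketch of that selection (hypothetical helper name; model_filepath is the list of checkpoint paths, as in the diff):

def pick_pipeline_config(model_filepath):
    # The first checkpoint path decides which pipeline config is used,
    # mirroring self.distilled in the constructor above.
    if "distilled" in model_filepath[0]:
        return "ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml"
    return "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"

# pick_pipeline_config(["ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors"])
# -> "ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml"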
ltx_video/pipelines/crf_compressor.py
CHANGED
@@ -1,4 +1,4 @@
-
+import av
 import torch
 import io
 import numpy as np
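The new top-level import pulls in PyAV (FFmpeg bindings), matching the av entry added to requirements.txt below. The compressor's body is not shown in this diff; the following is only a sketch of the kind of round-trip a CRF compressor typically performs, with an illustrative function name and signature:

import io

import av
import numpy as np

def crf_compress(frames: np.ndarray, crf: int = 23, fps: int = 8) -> bytes:
    # Encode uint8 RGB frames of shape (T, H, W, 3) to an in-memory H.264
    # stream at a fixed CRF; H and W must be even for yuv420p.
    buffer = io.BytesIO()
    container = av.open(buffer, mode="w", format="mp4")
    stream = container.add_stream("libx264", rate=fps)
    stream.width, stream.height = frames.shape[2], frames.shape[1]
    stream.pix_fmt = "yuv420p"
    stream.options = {"crf": str(crf)}
    for arr in frames:
        frame = av.VideoFrame.from_ndarray(arr, format="rgb24")
        for packet in stream.encode(frame):
            container.mux(packet)
    for packet in stream.encode():  # flush delayed packets
        container.mux(packet)
    container.close()
    return buffer.getvalue()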
requirements.txt
CHANGED
@@ -32,4 +32,5 @@ hydra-core
 librosa
 loguru
 sentencepiece
+av
 # rembg==2.0.65
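av is the package name of PyAV, required by the new import in ltx_video/pipelines/crf_compressor.py above. Its binary wheels bundle FFmpeg, so installing from this requirements.txt (or a plain pip install av) should suffice without a system FFmpeg.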
wan/text2video.py
CHANGED
@@ -80,9 +80,9 @@ class WanT2V:
 
         logging.info(f"Creating WanModel from {model_filename[-1]}")
         from mmgp import offload
-        # model_filename = "c:/temp/
+        # model_filename = "c:/temp/vace1.3/diffusion_pytorch_model.safetensors"
         # model_filename = "vace14B_quanto_bf16_int8.safetensors"
-        self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False
+        self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False , forcedConfigPath= "c:/temp/vace1.3/config.json")
         # offload.load_model_data(self.model, "e:/vace.safetensors")
         # offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth")
         # self.model.to(torch.bfloat16)

@@ -90,7 +90,7 @@
         self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
         # dtype = torch.bfloat16
         offload.change_dtype(self.model, dtype, True)
-        # offload.save_model(self.model, "
+        # offload.save_model(self.model, "wan2.1_Vace1.3B_mbf16.safetensors", config_file_path="c:/temp/vace1.3/config.json")
         # offload.save_model(self.model, "vace14B_quanto_fp16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/vace/vace_config.json")
         self.model.eval().requires_grad_(False)
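The same forcedConfigPath pattern recurs across this commit: judging from its usage here and in the text-encoder hunk, it makes the loader take the model architecture from an explicit config.json instead of whatever is embedded in the checkpoint. A minimal sketch of the call shape under that assumption (paths are illustrative, not the repository's):

from mmgp import offload
from wan.modules.model import WanModel  # import path is an assumption

model = offload.fast_load_transformers_model(
    "ckpts/wan2.1_text2video_1.3B_mbf16.safetensors",  # illustrative checkpoint
    modelClass=WanModel,
    do_quantize=False,              # True would quantize while loading
    writable_tensors=False,
    forcedConfigPath="ckpts/wan_config.json",          # illustrative config
)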
wgp.py
CHANGED
@@ -1528,7 +1528,7 @@ wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/w
 "ckpts/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_Fun_InP_1.3B_bf16.safetensors", "ckpts/wan2.1_Fun_InP_14B_bf16.safetensors",
 "ckpts/wan2.1_Fun_InP_14B_quanto_int8.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_bf16.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_quanto_int8.safetensors",
 "ckpts/wan2.1_fantasy_speaking_14B_bf16.safetensors"]
-ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors"]
+ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors"]
 
 hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
 "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ]

@@ -1539,12 +1539,12 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
         return [get_model_filename("i2v_720p", quantization, dtype_policy)]
     else:
         return []
-model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
+model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
 model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
                     "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
                     "flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
                     "sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B",
-                    "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.
+                    "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
 
 
 def get_model_type(model_filename):

@@ -1606,10 +1606,12 @@ def get_model_name(model_filename, description_container = [""]):
         model_name = "Wan2.1 Fantasy Speaking 720p"
         model_name += " 14B" if "14B" in model_filename else " 1.3B"
         description = "The Fantasy Speaking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking extension to process an audio Input."
-    elif "
-        model_name = "LTX Video"
-        model_name += " 0.9.7 13B" if "13B" in model_filename else " 0.9.6 2B"
+    elif "ltxv_0.9.7_13B_dev" in model_filename:
+        model_name = "LTX Video 0.9.7"
         description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.7-dev.yaml'.The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer."
+    elif "ltxv_0.9.7_13B_distilled" in model_filename:
+        model_name = "LTX Video 0.9.7 distilled"
+        description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).This is the distilled / fast version. The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer."
     elif "hunyuan_video_720" in model_filename:
         model_name = "Hunyuan Video text2video 720p"
         description = "Probably the best text 2 video model available."
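The table updates above only pay off when a checkpoint filename is mapped back to its model type. A sketch of how a signature table like this is typically consumed (the real get_model_type body is not shown in this diff; the table is excerpted for brevity):

model_signatures = {  # excerpt of the table above
    "ltxv_13B": "ltxv_0.9.7_13B_dev",
    "ltxv_13B_distilled": "ltxv_0.9.7_13B_distilled",
}

def get_model_type(model_filename):
    # First signature found as a substring of the filename wins; the two
    # ltxv signatures do not collide ("..._dev" vs "..._distilled").
    for model_type, signature in model_signatures.items():
        if signature in model_filename:
            return model_type
    raise Exception("Unknown model filename: " + model_filename)

# get_model_type("ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors")
# -> "ltxv_13B_distilled"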