DeepBeepMeep committed
Commit ccdbe4e · 1 Parent(s): 5796089

more fixes

hyvideo/text_encoder/__init__.py CHANGED
@@ -189,9 +189,9 @@ class TextEncoder(nn.Module):
         if "llm" in text_encoder_type:
             from mmgp import offload
             forcedConfigPath= None if "i2v" in text_encoder_type else "ckpts/llava-llama-3-8b/config.json"
-            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath, modelPrefix= "model" if forcedConfigPath !=None else None)
+            self.model= offload.fast_load_transformers_model(self.model_path, forcedConfigPath=forcedConfigPath)
             if forcedConfigPath != None:
-                self.model.final_layer_norm = self.model.norm
+                self.model.final_layer_norm = self.model.model.norm
 
         else:
             self.model, self.model_path = load_text_encoder(
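A note on this hunk: dropping the `modelPrefix= "model"` argument appears to leave the loaded checkpoint with its full module hierarchy intact, so the final norm layer now sits one level deeper (`self.model.model.norm` instead of `self.model.norm`). A minimal sketch of the same aliasing pattern, with toy modules standing in for the real LLaMA classes (hypothetical names, illustration only):

import torch.nn as nn

# Toy stand-ins for the real LLaMA modules (hypothetical, illustration only).
class Inner(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(16)  # analogous to the nested LlamaModel.norm

class CausalLMWrapper(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = Inner()  # extra nesting kept when no prefix is stripped at load time

lm = CausalLMWrapper()
# Alias the nested norm under the attribute name downstream code expects,
# mirroring `self.model.final_layer_norm = self.model.model.norm` above.
lm.final_layer_norm = lm.model.norm
assert lm.final_layer_norm is lm.model.norm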
ltx_video/ltxv.py CHANGED
@@ -155,14 +155,14 @@ class LTXV:
     ):
 
         self.mixed_precision_transformer = mixed_precision_transformer
-        # ckpt_path = Path(ckpt_path)
+        self.distilled = "distilled" in model_filepath[0]
         # with safe_open(ckpt_path, framework="pt") as f:
         # metadata = f.metadata()
         # config_str = metadata.get("config")
         # configs = json.loads(config_str)
         # allowed_inference_steps = configs.get("allowed_inference_steps", None)
         # transformer = Transformer3DModel.from_pretrained(ckpt_path)
-        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", config_file_path="config_transformer.json")
+        # transformer = offload.fast_load_transformers_model("c:/temp/ltxdistilled/diffusion_pytorch_model-00001-of-00006.safetensors", forcedConfigPath="c:/temp/ltxdistilled/config.json")
 
         # vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
         vae = offload.fast_load_transformers_model("ckpts/ltxv_0.9.7_VAE.safetensors", modelClass=CausalVideoAutoencoder)
@@ -174,8 +174,11 @@ class LTXV:
         # vae = offload.fast_load_transformers_model("vae.safetensors", modelClass=CausalVideoAutoencoder, modelPrefix= "vae", forcedConfigPath="config_vae.json")
         # offload.save_model(vae, "vae.safetensors", config_file_path="config_vae.json")
 
-
-        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
+        # model_filepath = "c:/temp/ltxd/ltxv-13b-0.9.7-distilled.safetensors"
+        transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel, forcedConfigPath= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", config_file_path= "c:/temp/ltxd/config.json")
+        # offload.save_model(transformer, "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/ltxd/config.json")
+        # transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
         transformer._model_dtype = dtype
         if mixed_precision_transformer:
             transformer._lock_dtype = torch.float
@@ -295,7 +298,10 @@ class LTXV:
         conditioning_media_paths = None
         conditioning_start_frames = None
 
-        pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
+        if self.distilled :
+            pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml"
+        else:
+            pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
         # check if pipeline_config is a file
         if not os.path.isfile(pipeline_config):
             raise ValueError(f"Pipeline config file {pipeline_config} does not exist")
ltx_video/pipelines/crf_compressor.py CHANGED
@@ -1,4 +1,4 @@
-# import av
+import av
 import torch
 import io
 import numpy as np
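With `av` now imported unconditionally, PyAV must be installed for this module to load at all, which is why `av` is added to requirements.txt below. For comparison, a sketch of the usual guard if the dependency were to stay optional (not what this commit does):

# Optional-dependency guard (alternative pattern, for comparison only).
try:
    import av  # PyAV, the 'av' package added to requirements.txt below
except ImportError:
    av = None

def require_av():
    # Fail at call time rather than import time when PyAV is missing.
    if av is None:
        raise RuntimeError("CRF compression requires PyAV: pip install av")
    return av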
requirements.txt CHANGED
@@ -32,4 +32,5 @@ hydra-core
 librosa
 loguru
 sentencepiece
+av
 # rembg==2.0.65
wan/text2video.py CHANGED
@@ -80,9 +80,9 @@ class WanT2V:
 
         logging.info(f"Creating WanModel from {model_filename[-1]}")
         from mmgp import offload
-        # model_filename = "c:/temp/vace/diffusion_pytorch_model-00001-of-00007.safetensors"
+        # model_filename = "c:/temp/vace1.3/diffusion_pytorch_model.safetensors"
         # model_filename = "vace14B_quanto_bf16_int8.safetensors"
-        self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False) # , forcedConfigPath= "c:/temp/vace/vace_config.json")
+        self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False , forcedConfigPath= "c:/temp/vace1.3/config.json")
         # offload.load_model_data(self.model, "e:/vace.safetensors")
         # offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth")
         # self.model.to(torch.bfloat16)
@@ -90,7 +90,7 @@ class WanT2V:
         self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
         # dtype = torch.bfloat16
         offload.change_dtype(self.model, dtype, True)
-        # offload.save_model(self.model, "vace14B_bf16.safetensors", config_file_path="c:/temp/vace/vace_config.json")
+        # offload.save_model(self.model, "wan2.1_Vace1.3B_mbf16.safetensors", config_file_path="c:/temp/vace1.3/config.json")
         # offload.save_model(self.model, "vace14B_quanto_fp16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/vace/vace_config.json")
         self.model.eval().requires_grad_(False)
 
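In the first hunk the `forcedConfigPath` argument moves from a trailing comment into the live `fast_load_transformers_model` call, pointing at a local Windows path; loading will likely fail on machines where `c:/temp/vace1.3/config.json` does not exist. A sketch of a more portable guard (hypothetical, not part of the commit):

import os

def forced_config_kwargs(local_config="c:/temp/vace1.3/config.json"):
    # Hypothetical guard: only force the local config override when the file
    # actually exists; otherwise fall back to the checkpoint's own config.
    return {"forcedConfigPath": local_config} if os.path.isfile(local_config) else {}

# Usage (mirrors the call in the hunk above):
# self.model = offload.fast_load_transformers_model(
#     model_filename, modelClass=WanModel, do_quantize=quantizeTransformer,
#     writable_tensors=False, **forced_config_kwargs())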
wgp.py CHANGED
@@ -1528,7 +1528,7 @@ wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/w
                  "ckpts/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_Fun_InP_1.3B_bf16.safetensors", "ckpts/wan2.1_Fun_InP_14B_bf16.safetensors",
                  "ckpts/wan2.1_Fun_InP_14B_quanto_int8.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_bf16.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_quanto_int8.safetensors",
                  "ckpts/wan2.1_fantasy_speaking_14B_bf16.safetensors"]
-ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors"]
+ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_distilled_quanto_bf16_int8.safetensors"]
 
 hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
                   "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ]
@@ -1539,12 +1539,12 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
         return [get_model_filename("i2v_720p", quantization, dtype_policy)]
     else:
         return []
-model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
+model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
 model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
                     "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
                     "flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
                     "sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B",
-                    "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
+                    "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B_dev", "ltxv_13B_distilled" : "ltxv_0.9.7_13B_distilled", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
 
 
 def get_model_type(model_filename):
@@ -1606,10 +1606,12 @@ def get_model_name(model_filename, description_container = [""]):
         model_name = "Wan2.1 Fantasy Speaking 720p"
         model_name += " 14B" if "14B" in model_filename else " 1.3B"
         description = "The Fantasy Speaking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking extension to process an audio Input."
-    elif "ltxv" in model_filename:
-        model_name = "LTX Video"
-        model_name += " 0.9.7 13B" if "13B" in model_filename else " 0.9.6 2B"
+    elif "ltxv_0.9.7_13B_dev" in model_filename:
+        model_name = "LTX Video 0.9.7"
         description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.7-dev.yaml'.The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer."
+    elif "ltxv_0.9.7_13B_distilled" in model_filename:
+        model_name = "LTX Video 0.9.7 distilled"
+        description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).This is the distilled / fast version. The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer."
     elif "hunyuan_video_720" in model_filename:
         model_name = "Hunyuan Video text2video 720p"
         description = "Probably the best text 2 video model available."