eeuuia committed on
Commit
a18d7a7
·
verified ·
1 Parent(s): da423da

Update LTX-Video/ltx_video/pipelines/pipeline_ltx_video.py

Browse files
LTX-Video/ltx_video/pipelines/pipeline_ltx_video.py CHANGED
@@ -107,11 +107,6 @@ class SpyLatent:
107
  necessária se o tensor de entrada for 3D.
108
  save_visual (bool): Se True, decodifica com o VAE e salva uma imagem.
109
  """
110
- #print(f"\n--- [INSPEÇÃO DE LATENTE: {tag}] ---")
111
- #if not isinstance(tensor, torch.Tensor):
112
- # print(f" AVISO: O objeto fornecido para '{tag}' não é um tensor.")
113
- # print("--- [FIM DA INSPEÇÃO] ---\n")
114
- # return
115
 
116
  try:
117
  # --- Imprime Estatísticas do Tensor Original ---
@@ -120,7 +115,7 @@ class SpyLatent:
120
  # --- Converte para 5D se necessário ---
121
  tensor_5d = self._to_5d(tensor, reference_shape_5d)
122
  if tensor_5d is not None and tensor.ndim == 3:
123
- self._print_stats("Convertido para 5D", tensor_5d)
124
 
125
  # --- Visualização com VAE ---
126
  if save_visual and self.vae is not None and tensor_5d is not None:
@@ -129,7 +124,7 @@ class SpyLatent:
129
 
130
  frame_idx_to_viz = min(1, tensor_5d.shape[2] - 1)
131
  if frame_idx_to_viz < 0:
132
- print(" VISUALIZAÇÃO (VAE): Tensor não tem frames para visualizar.")
133
  else:
134
  #print(f" VISUALIZAÇÃO (VAE): Usando frame de índice {frame_idx_to_viz}.")
135
  latent_slice = tensor_5d[:, :, frame_idx_to_viz:frame_idx_to_viz+1, :, :]
@@ -138,7 +133,7 @@ class SpyLatent:
138
  pixel_slice = self.vae.decode(latent_slice / self.vae.config.scaling_factor).sample
139
 
140
  save_image((pixel_slice / 2 + 0.5).clamp(0, 1), os.path.join(self.output_dir, f"inspect_{tag.lower()}.png"))
141
- print(" VISUALIZAÇÃO (VAE): Imagem salva.")
142
 
143
  except Exception as e:
144
  #print(f" ERRO na inspeção: {e}")
@@ -163,7 +158,7 @@ class SpyLatent:
163
  std = tensor.std().item()
164
  min_val = tensor.min().item()
165
  max_val = tensor.max().item()
166
- print(f" {prefix}: {tensor.shape}")
167
 
168
 
169
 
@@ -1086,11 +1081,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1086
  **retrieve_timesteps_kwargs,
1087
  )
1088
 
1089
- try:
1090
- print(f"[LTX2]LATENTS {latents.shape}")
1091
- except Exception:
1092
- pass
1093
-
1094
  if self.allowed_inference_steps is not None:
1095
  for timestep in [round(x, 4) for x in timesteps.tolist()]:
1096
  assert (
@@ -1159,11 +1150,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1159
  max_new_tokens=text_encoder_max_tokens,
1160
  )
1161
 
1162
- try:
1163
- print(f"[LTX3]LATENTS {latents.shape}")
1164
- except Exception:
1165
- pass
1166
-
1167
  # 3. Encode input prompt
1168
  if self.text_encoder is not None:
1169
  self.text_encoder = self.text_encoder.to(self._execution_device)
@@ -1230,7 +1217,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1230
  )
1231
 
1232
  try:
1233
- print(f"[LTX4]LATENTS {latents.shape}")
1234
  original_shape = latents
1235
  except Exception:
1236
  pass
@@ -1252,20 +1239,11 @@ class LTXVideoPipeline(DiffusionPipeline):
1252
  init_latents = latents.clone() # Used for image_cond_noise_update
1253
 
1254
  try:
1255
- print(f"[LTXCond]conditioning_mask {conditioning_mask.shape}")
1256
- except Exception:
1257
- pass
1258
-
1259
- try:
1260
- print(f"[LTXCond]pixel_coords {pixel_coords.shape}")
1261
- except Exception:
1262
- pass
1263
-
1264
- try:
1265
- print(f"[LTXCond]pixel_coords {pixel_coords.shape}")
1266
  except Exception:
1267
  pass
1268
 
 
1269
 
1270
 
1271
 
@@ -1273,10 +1251,6 @@ class LTXVideoPipeline(DiffusionPipeline):
1273
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1274
 
1275
 
1276
- try:
1277
- print(f"[LTX5]LATENTS {latents.shape}")
1278
- except Exception:
1279
- pass
1280
 
1281
  # 7. Denoising loop
1282
  num_warmup_steps = max(
@@ -1337,11 +1311,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1337
  generator,
1338
  )
1339
 
1340
- try:
1341
- print(f"[LTX6]LATENTS {latents.shape}")
1342
- self.spy.inspect(latents, "LTX6_After_Patchify", reference_shape_5d=original_shape)
1343
- except Exception:
1344
- pass
1345
 
1346
 
1347
 
@@ -1352,11 +1322,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1352
  latent_model_input, t
1353
  )
1354
 
1355
- try:
1356
- print(f"[LTX7]LATENTS {latent_model_input.shape}")
1357
- self.spy.inspect(latents, "LTX7_After_Patchify", reference_shape_5d=original_shape)
1358
- except Exception:
1359
- pass
1360
 
1361
  current_timestep = t
1362
  if not torch.is_tensor(current_timestep):
@@ -1473,12 +1439,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1473
  stochastic_sampling=stochastic_sampling,
1474
  )
1475
 
1476
- try:
1477
- print(f"[LTX8]LATENTS {latents.shape}")
1478
- self.spy.inspect(latents, "LTX8_After_Patchify", reference_shape_5d=original_shape)
1479
- except Exception:
1480
- pass
1481
-
1482
  # call the callback, if provided
1483
  if i == len(timesteps) - 1 or (
1484
  (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
@@ -1490,12 +1451,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1490
 
1491
 
1492
 
1493
- try:
1494
- print(f"[LTX9]LATENTS {latents.shape}")
1495
- self.spy.inspect(latents, "LTX9_After_Patchify", reference_shape_5d=original_shape)
1496
-
1497
- except Exception:
1498
- pass
1499
 
1500
 
1501
  if offload_to_cpu:
@@ -1507,11 +1463,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1507
  latents = latents[:, num_cond_latents:]
1508
 
1509
 
1510
- try:
1511
- print(f"[LTX10]LATENTS {latents.shape}")
1512
- self.spy.inspect(latents, "LTX10_After_Patchify", reference_shape_5d=original_shape)
1513
- except Exception:
1514
- pass
1515
 
1516
  latents = self.patchifier.unpatchify(
1517
  latents=latents,
@@ -1520,6 +1472,14 @@ class LTXVideoPipeline(DiffusionPipeline):
1520
  out_channels=self.transformer.in_channels
1521
  // math.prod(self.patchifier.patch_size),
1522
  )
 
 
 
 
 
 
 
 
1523
  if output_type != "latent":
1524
  if self.vae.decoder.timestep_conditioning:
1525
  noise = torch.randn_like(latents)
@@ -1549,7 +1509,7 @@ class LTXVideoPipeline(DiffusionPipeline):
1549
  )
1550
 
1551
  try:
1552
- print(f"[LTX11]LATENTS {latents.shape}")
1553
  except Exception:
1554
  pass
1555
 
 
107
  necessária se o tensor de entrada for 3D.
108
  save_visual (bool): Se True, decodifica com o VAE e salva uma imagem.
109
  """
 
 
 
 
 
110
 
111
  try:
112
  # --- Imprime Estatísticas do Tensor Original ---
 
115
  # --- Converte para 5D se necessário ---
116
  tensor_5d = self._to_5d(tensor, reference_shape_5d)
117
  if tensor_5d is not None and tensor.ndim == 3:
118
+ #self._print_stats("Convertido para 5D", tensor_5d)
119
 
120
  # --- Visualização com VAE ---
121
  if save_visual and self.vae is not None and tensor_5d is not None:
 
124
 
125
  frame_idx_to_viz = min(1, tensor_5d.shape[2] - 1)
126
  if frame_idx_to_viz < 0:
127
+ #print(" VISUALIZAÇÃO (VAE): Tensor não tem frames para visualizar.")
128
  else:
129
  #print(f" VISUALIZAÇÃO (VAE): Usando frame de índice {frame_idx_to_viz}.")
130
  latent_slice = tensor_5d[:, :, frame_idx_to_viz:frame_idx_to_viz+1, :, :]
 
133
  pixel_slice = self.vae.decode(latent_slice / self.vae.config.scaling_factor).sample
134
 
135
  save_image((pixel_slice / 2 + 0.5).clamp(0, 1), os.path.join(self.output_dir, f"inspect_{tag.lower()}.png"))
136
+ #print(" VISUALIZAÇÃO (VAE): Imagem salva.")
137
 
138
  except Exception as e:
139
  #print(f" ERRO na inspeção: {e}")
 
158
  std = tensor.std().item()
159
  min_val = tensor.min().item()
160
  max_val = tensor.max().item()
161
+ print(f"{tensor.shape}")
162
 
163
 
164
 
 
1081
  **retrieve_timesteps_kwargs,
1082
  )
1083
 
1084
+
 
 
 
 
1085
  if self.allowed_inference_steps is not None:
1086
  for timestep in [round(x, 4) for x in timesteps.tolist()]:
1087
  assert (
 
1150
  max_new_tokens=text_encoder_max_tokens,
1151
  )
1152
 
1153
+
 
 
 
 
1154
  # 3. Encode input prompt
1155
  if self.text_encoder is not None:
1156
  self.text_encoder = self.text_encoder.to(self._execution_device)
 
1217
  )
1218
 
1219
  try:
1220
+ print(f"[LTX]RUIDO-LATENTS-INICIAL {latents.shape}")
1221
  original_shape = latents
1222
  except Exception:
1223
  pass
 
1239
  init_latents = latents.clone() # Used for image_cond_noise_update
1240
 
1241
  try:
1242
+ print(f"[LTXCond]conditioning_items {conditioning_items.shape}")
 
 
 
 
 
 
 
 
 
 
1243
  except Exception:
1244
  pass
1245
 
1246
+
1247
 
1248
 
1249
 
 
1251
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1252
 
1253
 
 
 
 
 
1254
 
1255
  # 7. Denoising loop
1256
  num_warmup_steps = max(
 
1311
  generator,
1312
  )
1313
 
1314
+
 
 
 
 
1315
 
1316
 
1317
 
 
1322
  latent_model_input, t
1323
  )
1324
 
1325
+
 
 
 
 
1326
 
1327
  current_timestep = t
1328
  if not torch.is_tensor(current_timestep):
 
1439
  stochastic_sampling=stochastic_sampling,
1440
  )
1441
 
1442
+
 
 
 
 
 
1443
  # call the callback, if provided
1444
  if i == len(timesteps) - 1 or (
1445
  (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
 
1451
 
1452
 
1453
 
1454
+
 
 
 
 
 
1455
 
1456
 
1457
  if offload_to_cpu:
 
1463
  latents = latents[:, num_cond_latents:]
1464
 
1465
 
1466
+
 
 
 
 
1467
 
1468
  latents = self.patchifier.unpatchify(
1469
  latents=latents,
 
1472
  out_channels=self.transformer.in_channels
1473
  // math.prod(self.patchifier.patch_size),
1474
  )
1475
+
1476
+
1477
+ try:
1478
+ print(f"[LTX10]LATENTS Fim{latents.shape}")
1479
+ #self.spy.inspect(latents, "LTX_After_Patchify", reference_shape_5d=original_shape)
1480
+ except Exception:
1481
+ pass
1482
+
1483
  if output_type != "latent":
1484
  if self.vae.decoder.timestep_conditioning:
1485
  noise = torch.randn_like(latents)
 
1509
  )
1510
 
1511
  try:
1512
+ print(f"[LTX11]LATENTS_pix_fim{latents.shape}")
1513
  except Exception:
1514
  pass
1515