eeuuia committed on
Commit ecd2b0d · verified · 1 Parent(s): ecd3981

Update api/ltx/ltx_aduc_manager.py

Files changed (1)
  1. api/ltx/ltx_aduc_manager.py +36 -22
api/ltx/ltx_aduc_manager.py CHANGED
@@ -9,10 +9,11 @@ from pathlib import Path
 import threading
 import queue
 import time
+import yaml
 from typing import List, Optional, Callable, Any, Tuple
 
 # Imports for the builders and the gpu_manager
-from api.ltx.ltx.ltx_utils import get_main_ltx_pipeline, get_main_vae
+from api.ltx.ltx_utils import get_main_ltx_pipeline, get_main_vae
 from managers.gpu_manager import gpu_manager
 
 # --- Adds the LTX-Video path for type imports ---
@@ -63,24 +64,42 @@ class LTXMainWorker(BaseWorker):
     def __init__(self, worker_id: int, device: torch.device):
         super().__init__(worker_id, device)
         self.pipeline: Optional[LTXVideoPipeline] = None
+        self.autocast_dtype: torch.dtype = torch.float32
 
     def _load_models(self):
         logging.info(f"[LTXWorker-{self.worker_id}] Loading models to CPU...")
         self.pipeline = get_main_ltx_pipeline()
+        self._set_precision_policy()
         logging.info(f"[LTXWorker-{self.worker_id}] Moving pipeline to {self.device}...")
         self.pipeline.to(self.device)
 
+    def _set_precision_policy(self):
+        """Determines the dtype for torch.autocast based on the config."""
+        try:
+            config_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
+            with open(config_path, "r") as file:
+                config = yaml.safe_load(file)
+            precision = str(config.get("precision", "bfloat16")).lower()
+            if precision in ["float8_e4m3fn", "bfloat16"]:
+                self.autocast_dtype = torch.bfloat16
+            elif precision == "mixed_precision":
+                self.autocast_dtype = torch.float16
+            logging.info(f"[LTXWorker-{self.worker_id}] Autocast precision policy set to {self.autocast_dtype}")
+        except Exception as e:
+            logging.warning(f"[LTXWorker-{self.worker_id}] Could not set precision policy from config. Defaulting to float32. Error: {e}")
+            self.autocast_dtype = torch.float32
+
     def execute(self, job_func: Callable, args: tuple, kwargs: dict) -> Any:
-        """Executes a job, managing the 'busy' state."""
         self.is_busy = True
         logging.info(f"Worker {self.worker_id} (LTX) starting job: {job_func.__name__}")
         try:
-            result = job_func(self.pipeline, *args, **kwargs)
+            # Passes its own pipeline instance and the dtype to the job function
+            result = job_func(self.pipeline, self.autocast_dtype, *args, **kwargs)
             logging.info(f"Worker {self.worker_id} (LTX) finished job successfully.")
             return result
         except Exception as e:
             logging.error(f"Worker {self.worker_id} (LTX) job failed!", exc_info=True)
-            self.is_healthy = False  # A job failure marks the worker as unhealthy
+            self.is_healthy = False
             raise
         finally:
             self.is_busy = False
@@ -99,7 +118,6 @@ class VAEWorker(BaseWorker):
         self.vae.eval()
 
     def execute(self, job_func: Callable, args: tuple, kwargs: dict) -> Any:
-        """Executes a job, managing the 'busy' state."""
         self.is_busy = True
         logging.info(f"Worker {self.worker_id} (VAE) starting job: {job_func.__name__}")
         try:
@@ -138,7 +156,6 @@ class LTXAducManager:
 
         self._initialize_workers()
 
-        # Starts consumer threads to process the queues
         self.ltx_dispatcher = threading.Thread(target=self._dispatch_jobs, args=(self.ltx_job_queue, self.ltx_workers), daemon=True)
         self.vae_dispatcher = threading.Thread(target=self._dispatch_jobs, args=(self.vae_job_queue, self.vae_workers), daemon=True)
         self.health_monitor = threading.Thread(target=self._health_check_loop, daemon=True)
@@ -152,17 +169,16 @@ class LTXAducManager:
 
     def _initialize_workers(self):
         """Creates and starts the workers based on the allocated GPUs."""
-        # Assuming gpu_manager now has get_ltx_devices() and get_seedvr_devices() that return lists
-        ltx_gpus = gpu_manager.get_ltx_device()  # Adjust if the name differs
-        vae_gpus = gpu_manager.get_ltx_vae_device()  # Adjust if the name differs
+        ltx_gpus = [gpu_manager.get_ltx_device().index]  # Assuming the getter returns a device object
+        vae_gpus = [gpu_manager.get_ltx_vae_device().index]
 
         with self.pool_lock:
-            for i, device_id in enumerate([ltx_gpus]):  # Assuming it returns a list
+            for i, device_id in enumerate(ltx_gpus):
                 worker = LTXMainWorker(worker_id=i, device=torch.device(f"cuda:{device_id}"))
                 self.ltx_workers.append(worker)
                 worker.start()
 
-            for i, device_id in enumerate([vae_gpus]):  # Assuming it returns a list
+            for i, device_id in enumerate(vae_gpus):
                 worker = VAEWorker(worker_id=i, device=torch.device(f"cuda:{device_id}"))
                 self.vae_workers.append(worker)
                 worker.start()
@@ -170,6 +186,8 @@ class LTXAducManager:
     def _get_available_worker(self, worker_pool: List[BaseWorker]) -> Optional[BaseWorker]:
         """Finds a healthy, idle worker in the pool."""
         with self.pool_lock:
+            # Simple round-robin strategy to distribute the load
+            # A more sophisticated strategy could check GPU load
             for worker in worker_pool:
                 healthy, busy = worker.get_status()
                 if healthy and not busy:
@@ -184,7 +202,7 @@ class LTXAducManager:
             while worker is None:
                 worker = self._get_available_worker(worker_pool)
                 if worker is None:
-                    time.sleep(0.1)  # Wait for a worker to become free
+                    time.sleep(0.1)
 
             try:
                 result = worker.execute(job_func, args, kwargs)
@@ -200,36 +218,32 @@ class LTXAducManager:
         with self.pool_lock:
            for i, worker in enumerate(self.ltx_workers):
                 if not worker.is_alive() or not worker.is_healthy:
-                    logging.warning(f"LTX Worker {worker.worker_id} on {worker.device} is UNHEALTHY. Restarting...")
+                    logging.warning(f"LTX Worker {worker.worker_id} on {worker.device} is UNHEALTHY or dead. Restarting...")
                     new_worker = LTXMainWorker(worker.worker_id, worker.device)
                     self.ltx_workers[i] = new_worker
                     new_worker.start()
-            # Repeat the loop for VAE workers
+
             for i, worker in enumerate(self.vae_workers):
                 if not worker.is_alive() or not worker.is_healthy:
-                    logging.warning(f"VAE Worker {worker.worker_id} on {worker.device} is UNHEALTHY. Restarting...")
+                    logging.warning(f"VAE Worker {worker.worker_id} on {worker.device} is UNHEALTHY or dead. Restarting...")
                     new_worker = VAEWorker(worker.worker_id, worker.device)
                     self.vae_workers[i] = new_worker
                     new_worker.start()
 
     def submit_job(self, job_type: str, job_func: Callable, *args, **kwargs) -> Any:
-        """
-        Public entry point for submitting a job to the pool.
-        This function is synchronous: it waits for the result.
-        """
+        """Public entry point for synchronously submitting a job to the pool."""
         if job_type not in ['ltx', 'vae']:
             raise ValueError("Invalid job_type. Must be 'ltx' or 'vae'.")
 
         job_queue = self.ltx_job_queue if job_type == 'ltx' else self.vae_job_queue
-        future = queue.Queue()  # We use a queue as a 'future' to get the result back
+        future = queue.Queue(1)
 
         job_queue.put((job_func, args, kwargs, future))
 
-        # Blocks and waits for the result to be placed in the 'future' by the dispatcher
         result = future.get()
 
         if isinstance(result, Exception):
-            raise result  # If the job failed, re-raise the exception on the main thread
+            raise result
 
         return result
 
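
This commit changes the worker contract: LTX job functions now receive the worker's pipeline and its `autocast_dtype` as the first two positional arguments, and callers obtain results synchronously through `submit_job`, which re-raises any exception returned via the queue-backed 'future'. Below is a minimal sketch of a compatible job function and call site; `generate_latents`, the pipeline call signature, and the `ltx_aduc_manager` instance name are illustrative assumptions, not part of the commit.

```python
import torch

def generate_latents(pipeline, autocast_dtype: torch.dtype, prompt: str, num_frames: int = 97):
    # The worker invokes job_func(pipeline, autocast_dtype, *args, **kwargs),
    # so the first two parameters are injected by LTXMainWorker.execute.
    # Run the pipeline under the precision policy the worker derived from its YAML config.
    with torch.autocast(device_type="cuda", dtype=autocast_dtype):
        # Assumed pipeline call signature for illustration only.
        return pipeline(prompt=prompt, num_frames=num_frames)

# Hypothetical call site: submit_job blocks until the dispatcher places the
# result (or the job's exception) into the single-slot future queue.
# result = ltx_aduc_manager.submit_job("ltx", generate_latents, prompt="a red fox", num_frames=97)
```

Because `submit_job` re-raises exceptions placed in the future, a failed job surfaces at the call site as a normal Python exception instead of a silent sentinel value.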