Spaces:

yeq6x
/

QIE-LoRA-training-with-musubi-tuner

Running on Zero

yeq6x committed 11 days ago

Commit

21d8059

1 Parent(s): 08d5f7b

Add active process management for training in app.py

Implement functionality to manage and stop active training processes on Ubuntu. Introduce a hard stop button in the UI that allows users to terminate ongoing training sessions. Enhance the run_training function to register active processes and handle termination signals gracefully, improving user control and experience during training operations.

Files changed (1) hide show

app.py +60 -0

app.py CHANGED Viewed

@@ -13,6 +13,8 @@ import json
 import gradio as gr
 import importlib
 import spaces
 # Local modules
 from download_qwen_image_models import download_all_models, DEFAULT_MODELS_DIR
@@ -41,6 +43,11 @@ MODELS_ROOT_RUNTIME = DEFAULT_MODELS_ROOT
 AUTO_DIR_RUNTIME = WORKSPACE_AUTO_DIR
 DATA_ROOT_RUNTIME = DEFAULT_DATA_ROOT
 def _bash_quote(s: str) -> str:
     """Return a POSIX-safe single-quoted string literal representing s."""
@@ -662,6 +669,32 @@ def run_training(
     if not output_name.strip():
         log_buf += "[ERROR] OUTPUT NAME is required.\n"
         yield (log_buf, ckpts, artifacts)
         return
     if not caption.strip():
         log_buf += "[ERROR] CAPTION is required.\n"
@@ -814,7 +847,16 @@ def run_training(
         bufsize=1,
         universal_newlines=True,
         env=child_env,
     )
     try:
         assert proc.stdout is not None
         i = 0
@@ -830,6 +872,11 @@ def run_training(
             yield (log_buf, ckpts, artifacts)
     finally:
         code = proc.wait()
         # Try to locate latest LoRA file for download
         lora_path = None
         try:
@@ -837,6 +884,12 @@ def run_training(
         except Exception:
             pass
         lora_path = ckpts[0] if ckpts else None
         log_buf += f"[QIE] Exit code: {code}\n"
         # Final attempt to include metadata.jsonl
         metadata_json = os.path.join(out_base, "metadata.jsonl")
@@ -1000,6 +1053,7 @@ def build_ui() -> gr.Blocks:
                 ckpt_files = gr.Files(label="Checkpoints (live)", interactive=False)
                 scripts_files = gr.Files(label="Scripts & Config (live)", interactive=False)
                 with gr.Row():
                     refresh_scripts_btn = gr.Button("ファイルを再取得", variant="secondary")
                 # moved max_epochs/save_every above next to OUTPUT NAME
@@ -1056,6 +1110,12 @@ def build_ui() -> gr.Blocks:
                     outputs=[ckpt_files, scripts_files],
                 )
             with gr.TabItem("Prompt Generator"):
                 gr.Markdown("""
                 # 🎨 A→B 変換プロンプト自動生成

 import gradio as gr
 import importlib
 import spaces
+import signal
+from threading import Lock
 # Local modules
 from download_qwen_image_models import download_all_models, DEFAULT_MODELS_DIR
 AUTO_DIR_RUNTIME = WORKSPACE_AUTO_DIR
 DATA_ROOT_RUNTIME = DEFAULT_DATA_ROOT
+# Active process management for hard stop (Ubuntu)
+_ACTIVE_LOCK: Lock = Lock()
+_ACTIVE_PROC: Optional[subprocess.Popen] = None
+_ACTIVE_PGID: Optional[int] = None
 def _bash_quote(s: str) -> str:
     """Return a POSIX-safe single-quoted string literal representing s."""
     if not output_name.strip():
         log_buf += "[ERROR] OUTPUT NAME is required.\n"
         yield (log_buf, ckpts, artifacts)
+def _stop_active_training() -> None:
+    """Hard stop for Ubuntu: terminate the process group of the running training process.
+
+    Reads the registered process and process-group id from the module-level
+    ``_ACTIVE_PROC`` / ``_ACTIVE_PGID`` globals, sends SIGTERM, waits up to
+    5 seconds for the process to exit, and escalates to SIGKILL if the wait
+    raises (e.g. on timeout). All signalling errors are swallowed, so this
+    function is best-effort and never raises from those calls. It returns
+    immediately when no process is registered.
+    """
+    # Snapshot the globals under the lock; the signalling below runs outside
+    # the lock so the lock is not held during the (up to 5 s) wait.
+    with _ACTIVE_LOCK:
+        proc = _ACTIVE_PROC
+        pgid = _ACTIVE_PGID
+    if not proc:
+        return
+    # First attempt: SIGTERM. Prefer the whole process group when its id is
+    # known; otherwise signal only the registered process itself.
+    try:
+        if pgid is not None:
+            os.killpg(pgid, signal.SIGTERM)
+        else:
+            os.kill(proc.pid, signal.SIGTERM)
+    except Exception:
+        # Best-effort: errors from signalling are deliberately ignored.
+        pass
+    try:
+        proc.wait(timeout=5)
+    except Exception:
+        # The wait failed (e.g. timed out): escalate to SIGKILL.
+        # NOTE: there is no second wait after SIGKILL in this function.
+        try:
+            if pgid is not None:
+                os.killpg(pgid, signal.SIGKILL)
+            else:
+                os.kill(proc.pid, signal.SIGKILL)
+        except Exception:
+            pass
         return
     if not caption.strip():
         log_buf += "[ERROR] CAPTION is required.\n"
         bufsize=1,
         universal_newlines=True,
         env=child_env,
+        preexec_fn=os.setsid,
     )
+    # Register active process for hard stop
+    with _ACTIVE_LOCK:
+        global _ACTIVE_PROC, _ACTIVE_PGID
+        _ACTIVE_PROC = proc
+        try:
+            _ACTIVE_PGID = os.getpgid(proc.pid)
+        except Exception:
+            _ACTIVE_PGID = None
     try:
         assert proc.stdout is not None
         i = 0
             yield (log_buf, ckpts, artifacts)
     finally:
         code = proc.wait()
+        # Clear the active-process registration, but only if it still refers to this proc
+        with _ACTIVE_LOCK:
+            if _ACTIVE_PROC is proc:
+                _ACTIVE_PROC = None
+                _ACTIVE_PGID = None
         # Try to locate latest LoRA file for download
         lora_path = None
         try:
         except Exception:
             pass
         lora_path = ckpts[0] if ckpts else None
+        if code < 0:
+            try:
+                sig = -code
+                log_buf += f"[QIE] Terminated by signal: {sig}\n"
+            except Exception:
+                log_buf += f"[QIE] Terminated by signal.\n"
         log_buf += f"[QIE] Exit code: {code}\n"
         # Final attempt to include metadata.jsonl
         metadata_json = os.path.join(out_base, "metadata.jsonl")
                 ckpt_files = gr.Files(label="Checkpoints (live)", interactive=False)
                 scripts_files = gr.Files(label="Scripts & Config (live)", interactive=False)
                 with gr.Row():
+                    stop_btn = gr.Button("学習を停止", variant="secondary")
                     refresh_scripts_btn = gr.Button("ファイルを再取得", variant="secondary")
                 # moved max_epochs/save_every above next to OUTPUT NAME
                     outputs=[ckpt_files, scripts_files],
                 )
+                # Hard stop button (Ubuntu): kill active process group
+                def _on_stop_click():
+                    # Click handler: delegate to the module-level hard-stop
+                    # helper; it returns no value.
+                    _stop_active_training()
+                    return
+                stop_btn.click(fn=_on_stop_click, inputs=[], outputs=[])
             with gr.TabItem("Prompt Generator"):
                 gr.Markdown("""
                 # 🎨 A→B 変換プロンプト自動生成