Spaces:
Running
on
Zero
Running
on
Zero
Add active process management for training in app.py
Browse files
Implement functionality to manage and stop active training processes on Ubuntu. Introduce a hard-stop button in the UI that lets users terminate ongoing training sessions. Enhance the run_training function to register the active process and handle termination signals gracefully, improving user control and experience during training operations.
app.py
CHANGED
|
@@ -13,6 +13,8 @@ import json
|
|
| 13 |
import gradio as gr
|
| 14 |
import importlib
|
| 15 |
import spaces
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# Local modules
|
| 18 |
from download_qwen_image_models import download_all_models, DEFAULT_MODELS_DIR
|
|
@@ -41,6 +43,11 @@ MODELS_ROOT_RUNTIME = DEFAULT_MODELS_ROOT
|
|
| 41 |
AUTO_DIR_RUNTIME = WORKSPACE_AUTO_DIR
|
| 42 |
DATA_ROOT_RUNTIME = DEFAULT_DATA_ROOT
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
def _bash_quote(s: str) -> str:
|
| 46 |
"""Return a POSIX-safe single-quoted string literal representing s."""
|
|
@@ -662,6 +669,32 @@ def run_training(
|
|
| 662 |
if not output_name.strip():
|
| 663 |
log_buf += "[ERROR] OUTPUT NAME is required.\n"
|
| 664 |
yield (log_buf, ckpts, artifacts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
return
|
| 666 |
if not caption.strip():
|
| 667 |
log_buf += "[ERROR] CAPTION is required.\n"
|
|
@@ -814,7 +847,16 @@ def run_training(
|
|
| 814 |
bufsize=1,
|
| 815 |
universal_newlines=True,
|
| 816 |
env=child_env,
|
|
|
|
| 817 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
try:
|
| 819 |
assert proc.stdout is not None
|
| 820 |
i = 0
|
|
@@ -830,6 +872,11 @@ def run_training(
|
|
| 830 |
yield (log_buf, ckpts, artifacts)
|
| 831 |
finally:
|
| 832 |
code = proc.wait()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 833 |
# Try to locate latest LoRA file for download
|
| 834 |
lora_path = None
|
| 835 |
try:
|
|
@@ -837,6 +884,12 @@ def run_training(
|
|
| 837 |
except Exception:
|
| 838 |
pass
|
| 839 |
lora_path = ckpts[0] if ckpts else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
log_buf += f"[QIE] Exit code: {code}\n"
|
| 841 |
# Final attempt to include metadata.jsonl
|
| 842 |
metadata_json = os.path.join(out_base, "metadata.jsonl")
|
|
@@ -1000,6 +1053,7 @@ def build_ui() -> gr.Blocks:
|
|
| 1000 |
ckpt_files = gr.Files(label="Checkpoints (live)", interactive=False)
|
| 1001 |
scripts_files = gr.Files(label="Scripts & Config (live)", interactive=False)
|
| 1002 |
with gr.Row():
|
|
|
|
| 1003 |
refresh_scripts_btn = gr.Button("ファイルを再取得", variant="secondary")
|
| 1004 |
|
| 1005 |
# moved max_epochs/save_every above next to OUTPUT NAME
|
|
@@ -1056,6 +1110,12 @@ def build_ui() -> gr.Blocks:
|
|
| 1056 |
outputs=[ckpt_files, scripts_files],
|
| 1057 |
)
|
| 1058 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1059 |
with gr.TabItem("Prompt Generator"):
|
| 1060 |
gr.Markdown("""
|
| 1061 |
# 🎨 A→B 変換プロンプト自動生成
|
|
|
|
| 13 |
import gradio as gr
|
| 14 |
import importlib
|
| 15 |
import spaces
|
| 16 |
+
import signal
|
| 17 |
+
from threading import Lock
|
| 18 |
|
| 19 |
# Local modules
|
| 20 |
from download_qwen_image_models import download_all_models, DEFAULT_MODELS_DIR
|
|
|
|
| 43 |
AUTO_DIR_RUNTIME = WORKSPACE_AUTO_DIR
|
| 44 |
DATA_ROOT_RUNTIME = DEFAULT_DATA_ROOT
|
| 45 |
|
| 46 |
+
# Active process management for hard stop (Ubuntu)
|
| 47 |
+
_ACTIVE_LOCK: Lock = Lock()
|
| 48 |
+
_ACTIVE_PROC: Optional[subprocess.Popen] = None
|
| 49 |
+
_ACTIVE_PGID: Optional[int] = None
|
| 50 |
+
|
| 51 |
|
| 52 |
def _bash_quote(s: str) -> str:
|
| 53 |
"""Return a POSIX-safe single-quoted string literal representing s."""
|
|
|
|
| 669 |
if not output_name.strip():
|
| 670 |
log_buf += "[ERROR] OUTPUT NAME is required.\n"
|
| 671 |
yield (log_buf, ckpts, artifacts)
|
| 672 |
+
|
| 673 |
+
|
| 674 |
+
def _stop_active_training() -> None:
    """Hard stop for Ubuntu: terminate the running training process group.

    Takes a snapshot of the registered process and its process-group id while
    holding ``_ACTIVE_LOCK``, then (outside the lock) sends SIGTERM, waits up
    to five seconds for the process to exit, and escalates to SIGKILL if it
    has not. When a process-group id is recorded the signal goes to the whole
    group; otherwise it goes to the process itself.

    Returns silently when no process is registered. Signal-delivery failures
    reported by the OS (for example the target already being gone) are
    ignored, because stopping is best-effort.
    """
    import subprocess  # function-scope import: needed for TimeoutExpired

    with _ACTIVE_LOCK:
        proc = _ACTIVE_PROC
        pgid = _ACTIVE_PGID
    if proc is None:
        return

    def _deliver(sig: int) -> None:
        """Send *sig* to the process group if known, else to the process."""
        try:
            if pgid is not None:
                os.killpg(pgid, sig)
            else:
                # Popen.send_signal skips a process that has already exited,
                # so a recycled PID is never signalled by mistake.
                proc.send_signal(sig)
        except OSError:
            # Target vanished or may not be signalled; nothing more to do.
            pass

    _deliver(signal.SIGTERM)
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Graceful termination did not finish in time: force it.
        _deliver(signal.SIGKILL)
|
| 698 |
return
|
| 699 |
if not caption.strip():
|
| 700 |
log_buf += "[ERROR] CAPTION is required.\n"
|
|
|
|
| 847 |
bufsize=1,
|
| 848 |
universal_newlines=True,
|
| 849 |
env=child_env,
|
| 850 |
+
preexec_fn=os.setsid,
|
| 851 |
)
|
| 852 |
+
# Register active process for hard stop
|
| 853 |
+
with _ACTIVE_LOCK:
|
| 854 |
+
global _ACTIVE_PROC, _ACTIVE_PGID
|
| 855 |
+
_ACTIVE_PROC = proc
|
| 856 |
+
try:
|
| 857 |
+
_ACTIVE_PGID = os.getpgid(proc.pid)
|
| 858 |
+
except Exception:
|
| 859 |
+
_ACTIVE_PGID = None
|
| 860 |
try:
|
| 861 |
assert proc.stdout is not None
|
| 862 |
i = 0
|
|
|
|
| 872 |
yield (log_buf, ckpts, artifacts)
|
| 873 |
finally:
|
| 874 |
code = proc.wait()
|
| 875 |
+
# Clear active process registration if this proc
|
| 876 |
+
with _ACTIVE_LOCK:
|
| 877 |
+
if _ACTIVE_PROC is proc:
|
| 878 |
+
_ACTIVE_PROC = None
|
| 879 |
+
_ACTIVE_PGID = None
|
| 880 |
# Try to locate latest LoRA file for download
|
| 881 |
lora_path = None
|
| 882 |
try:
|
|
|
|
| 884 |
except Exception:
|
| 885 |
pass
|
| 886 |
lora_path = ckpts[0] if ckpts else None
|
| 887 |
+
if code < 0:
|
| 888 |
+
try:
|
| 889 |
+
sig = -code
|
| 890 |
+
log_buf += f"[QIE] Terminated by signal: {sig}\n"
|
| 891 |
+
except Exception:
|
| 892 |
+
log_buf += f"[QIE] Terminated by signal.\n"
|
| 893 |
log_buf += f"[QIE] Exit code: {code}\n"
|
| 894 |
# Final attempt to include metadata.jsonl
|
| 895 |
metadata_json = os.path.join(out_base, "metadata.jsonl")
|
|
|
|
| 1053 |
ckpt_files = gr.Files(label="Checkpoints (live)", interactive=False)
|
| 1054 |
scripts_files = gr.Files(label="Scripts & Config (live)", interactive=False)
|
| 1055 |
with gr.Row():
|
| 1056 |
+
stop_btn = gr.Button("学習を停止", variant="secondary")
|
| 1057 |
refresh_scripts_btn = gr.Button("ファイルを再取得", variant="secondary")
|
| 1058 |
|
| 1059 |
# moved max_epochs/save_every above next to OUTPUT NAME
|
|
|
|
| 1110 |
outputs=[ckpt_files, scripts_files],
|
| 1111 |
)
|
| 1112 |
|
| 1113 |
+
# Hard stop button (Ubuntu): kill active process group
|
| 1114 |
+
def _on_stop_click():
    """Click handler for the stop button: hard-stop the active training run."""
    _stop_active_training()
|
| 1117 |
+
stop_btn.click(fn=_on_stop_click, inputs=[], outputs=[])
|
| 1118 |
+
|
| 1119 |
with gr.TabItem("Prompt Generator"):
|
| 1120 |
gr.Markdown("""
|
| 1121 |
# 🎨 A→B 変換プロンプト自動生成
|