yeq6x committed on
Commit
21d8059
·
1 Parent(s): 08d5f7b

Add active process management for training in app.py

Browse files

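Implement functionality to manage and stop the active training process on Ubuntu. Introduce a hard stop button in the UI that lets users terminate an ongoing training session. Enhance the run_training function to launch the trainer in its own session (process group), register it as the active process, clear that registration once it exits, and log the terminating signal when the process is killed. The stop action sends SIGTERM to the process group and escalates to SIGKILL if the process has not exited within 5 seconds, improving user control and experience during training operations.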

Files changed (1) hide show
  1. app.py +60 -0
app.py CHANGED
@@ -13,6 +13,8 @@ import json
13
  import gradio as gr
14
  import importlib
15
  import spaces
 
 
16
 
17
  # Local modules
18
  from download_qwen_image_models import download_all_models, DEFAULT_MODELS_DIR
@@ -41,6 +43,11 @@ MODELS_ROOT_RUNTIME = DEFAULT_MODELS_ROOT
41
  AUTO_DIR_RUNTIME = WORKSPACE_AUTO_DIR
42
  DATA_ROOT_RUNTIME = DEFAULT_DATA_ROOT
43
 
 
 
 
 
 
44
 
45
  def _bash_quote(s: str) -> str:
46
  """Return a POSIX-safe single-quoted string literal representing s."""
@@ -662,6 +669,32 @@ def run_training(
662
  if not output_name.strip():
663
  log_buf += "[ERROR] OUTPUT NAME is required.\n"
664
  yield (log_buf, ckpts, artifacts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  return
666
  if not caption.strip():
667
  log_buf += "[ERROR] CAPTION is required.\n"
@@ -814,7 +847,16 @@ def run_training(
814
  bufsize=1,
815
  universal_newlines=True,
816
  env=child_env,
 
817
  )
 
 
 
 
 
 
 
 
818
  try:
819
  assert proc.stdout is not None
820
  i = 0
@@ -830,6 +872,11 @@ def run_training(
830
  yield (log_buf, ckpts, artifacts)
831
  finally:
832
  code = proc.wait()
 
 
 
 
 
833
  # Try to locate latest LoRA file for download
834
  lora_path = None
835
  try:
@@ -837,6 +884,12 @@ def run_training(
837
  except Exception:
838
  pass
839
  lora_path = ckpts[0] if ckpts else None
 
 
 
 
 
 
840
  log_buf += f"[QIE] Exit code: {code}\n"
841
  # Final attempt to include metadata.jsonl
842
  metadata_json = os.path.join(out_base, "metadata.jsonl")
@@ -1000,6 +1053,7 @@ def build_ui() -> gr.Blocks:
1000
  ckpt_files = gr.Files(label="Checkpoints (live)", interactive=False)
1001
  scripts_files = gr.Files(label="Scripts & Config (live)", interactive=False)
1002
  with gr.Row():
 
1003
  refresh_scripts_btn = gr.Button("ファイルを再取得", variant="secondary")
1004
 
1005
  # moved max_epochs/save_every above next to OUTPUT NAME
@@ -1056,6 +1110,12 @@ def build_ui() -> gr.Blocks:
1056
  outputs=[ckpt_files, scripts_files],
1057
  )
1058
 
 
 
 
 
 
 
1059
  with gr.TabItem("Prompt Generator"):
1060
  gr.Markdown("""
1061
  # 🎨 A→B 変換プロンプト自動生成
 
13
  import gradio as gr
14
  import importlib
15
  import spaces
16
+ import signal
17
+ from threading import Lock
18
 
19
  # Local modules
20
  from download_qwen_image_models import download_all_models, DEFAULT_MODELS_DIR
 
43
  AUTO_DIR_RUNTIME = WORKSPACE_AUTO_DIR
44
  DATA_ROOT_RUNTIME = DEFAULT_DATA_ROOT
45
 
46
+ # Active process management for hard stop (Ubuntu)
47
+ _ACTIVE_LOCK: Lock = Lock()
48
+ _ACTIVE_PROC: Optional[subprocess.Popen] = None
49
+ _ACTIVE_PGID: Optional[int] = None
50
+
51
 
52
  def _bash_quote(s: str) -> str:
53
  """Return a POSIX-safe single-quoted string literal representing s."""
 
669
  if not output_name.strip():
670
  log_buf += "[ERROR] OUTPUT NAME is required.\n"
671
  yield (log_buf, ckpts, artifacts)
672
+
673
+
674
def _stop_active_training() -> None:
    """Hard-stop the currently registered training process (POSIX / Ubuntu).

    Snapshots the registered process and its process-group id under
    ``_ACTIVE_LOCK``, then sends SIGTERM -- to the whole process group when a
    pgid is registered, otherwise to the single pid. If the process has not
    exited within 5 seconds, the same target is sent SIGKILL.

    Returns immediately when no process is registered. Signalling is
    best-effort: OS-level failures (for example, the target already exited)
    are ignored so a stop request never raises on that account.
    """
    # Read both globals in one critical section so proc and pgid belong to
    # the same registration; the signalling itself runs outside the lock.
    with _ACTIVE_LOCK:
        proc = _ACTIVE_PROC
        pgid = _ACTIVE_PGID
    if proc is None:
        return

    def _send(sig: int) -> None:
        """Deliver *sig* to the process group if known, else to the pid."""
        try:
            if pgid is not None:
                os.killpg(pgid, sig)
            else:
                os.kill(proc.pid, sig)
        except OSError:
            # ProcessLookupError / PermissionError: the target is already
            # gone or cannot be signalled. Nothing more to do here.
            pass

    _send(signal.SIGTERM)
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Still running after the grace period: escalate to SIGKILL.
        _send(signal.SIGKILL)
698
  return
699
  if not caption.strip():
700
  log_buf += "[ERROR] CAPTION is required.\n"
 
847
  bufsize=1,
848
  universal_newlines=True,
849
  env=child_env,
850
+ preexec_fn=os.setsid,
851
  )
852
+ # Register active process for hard stop
853
+ with _ACTIVE_LOCK:
854
+ global _ACTIVE_PROC, _ACTIVE_PGID
855
+ _ACTIVE_PROC = proc
856
+ try:
857
+ _ACTIVE_PGID = os.getpgid(proc.pid)
858
+ except Exception:
859
+ _ACTIVE_PGID = None
860
  try:
861
  assert proc.stdout is not None
862
  i = 0
 
872
  yield (log_buf, ckpts, artifacts)
873
  finally:
874
  code = proc.wait()
875
+ # Clear active process registration if this proc
876
+ with _ACTIVE_LOCK:
877
+ if _ACTIVE_PROC is proc:
878
+ _ACTIVE_PROC = None
879
+ _ACTIVE_PGID = None
880
  # Try to locate latest LoRA file for download
881
  lora_path = None
882
  try:
 
884
  except Exception:
885
  pass
886
  lora_path = ckpts[0] if ckpts else None
887
+ if code < 0:
888
+ try:
889
+ sig = -code
890
+ log_buf += f"[QIE] Terminated by signal: {sig}\n"
891
+ except Exception:
892
+ log_buf += f"[QIE] Terminated by signal.\n"
893
  log_buf += f"[QIE] Exit code: {code}\n"
894
  # Final attempt to include metadata.jsonl
895
  metadata_json = os.path.join(out_base, "metadata.jsonl")
 
1053
  ckpt_files = gr.Files(label="Checkpoints (live)", interactive=False)
1054
  scripts_files = gr.Files(label="Scripts & Config (live)", interactive=False)
1055
  with gr.Row():
1056
+ stop_btn = gr.Button("学習を停止", variant="secondary")
1057
  refresh_scripts_btn = gr.Button("ファイルを再取得", variant="secondary")
1058
 
1059
  # moved max_epochs/save_every above next to OUTPUT NAME
 
1110
  outputs=[ckpt_files, scripts_files],
1111
  )
1112
 
1113
+ # Hard stop button (Ubuntu): kill active process group
1114
def _on_stop_click() -> None:
    """Click handler for the stop button: hard-stop the active training run.

    Delegates to ``_stop_active_training`` and returns nothing, matching the
    empty ``outputs`` list the handler is wired with.
    """
    _stop_active_training()
1117
+ stop_btn.click(fn=_on_stop_click, inputs=[], outputs=[])
1118
+
1119
  with gr.TabItem("Prompt Generator"):
1120
  gr.Markdown("""
1121
  # 🎨 A→B 変換プロンプト自動生成