Spaces: Running on Zero
Enhance the run_training function in app.py to yield artifacts alongside checkpoints. Update the error-handling paths to yield artifacts together with the logs, and add tracking that exposes the dataset configuration and script files for download. Modify the UI to display the scripts and configuration files, improving user experience and accessibility.
app.py CHANGED
@@ -618,13 +618,14 @@ def run_training(
     # Basic validation
     log_buf = ""
     ckpts: List[str] = []
+    artifacts: List[str] = []
     if not output_name.strip():
         log_buf += "[ERROR] OUTPUT NAME is required.\n"
-        yield (log_buf, ckpts)
+        yield (log_buf, ckpts, artifacts)
         return
     if not caption.strip():
         log_buf += "[ERROR] CAPTION is required.\n"
-        yield (log_buf, ckpts)
+        yield (log_buf, ckpts, artifacts)
         return

     # Ensure /auto holds helper files expected by the script
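For readers less familiar with Gradio, this hunk relies on the pattern where a generator event handler yields one tuple per update, with one element per declared output component; adding `artifacts` as a third element therefore also requires a third output component (see the UI hunks further down). Below is a minimal, self-contained sketch of that pattern with purely illustrative names (`fake_run`, `logs_box`, etc.), not the Space's actual code:

    import gradio as gr
    from typing import List

    def fake_run(name: str):
        # Illustrative generator handler: each yield updates all three outputs.
        logs = ""
        ckpts: List[str] = []
        artifacts: List[str] = []
        for step in range(3):
            logs += f"[demo] step {step} for {name}\n"
            yield (logs, ckpts, artifacts)

    with gr.Blocks() as demo:
        name = gr.Textbox(label="Name")
        btn = gr.Button("Run")
        logs_box = gr.Textbox(label="Logs")
        ckpt_files = gr.Files(label="Checkpoints (live)")
        script_files = gr.Files(label="Scripts & Config (live)")
        # One output component per element of the yielded tuple.
        btn.click(fake_run, inputs=[name], outputs=[logs_box, ckpt_files, script_files])

    if __name__ == "__main__":
        demo.launch()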
@@ -644,11 +645,11 @@ def run_training(
     base_files = _extract_paths(image_uploads)
     if not base_files:
         log_buf += "[ERROR] No images uploaded for IMAGE_FOLDER.\n"
-        yield (log_buf, ckpts)
+        yield (log_buf, ckpts, artifacts)
         return
     base_filenames = _copy_uploads(base_files, img_dir)
     log_buf += f"[QIE] Copied {len(base_filenames)} base images to {img_dir}\n"
-    yield (log_buf, ckpts)
+    yield (log_buf, ckpts, artifacts)

     # Prepare control sets
     control_upload_sets = [
@@ -664,7 +665,7 @@ def run_training(
     # Require control_0; others optional
     if not control_upload_sets[0]:
         log_buf += "[ERROR] control_0 images are required.\n"
-        yield (log_buf, ckpts)
+        yield (log_buf, ckpts, artifacts)
         return

     control_dirs: List[Optional[str]] = []
@@ -679,7 +680,7 @@ def run_training(
         _copy_uploads(uploads, cdir)
         control_dirs.append(folder_name)
         log_buf += f"[QIE] Copied {len(uploads)} control_{i} images to {cdir}\n"
-        yield (log_buf, ckpts)
+        yield (log_buf, ckpts, artifacts)

     # Metadata.jsonl will be generated by create_image_caption_json.py in train_QIE.sh

@@ -705,6 +706,9 @@ def run_training(
         log_buf += f"[QIE] Updated dataset config: resolution=({train_res_w},{train_res_h}), batch_size={train_batch_size}, control_res=({control_res_w},{control_res_h})\n"
     except Exception as e:
         log_buf += f"[QIE] WARN: failed to update dataset config: {e}\n"
+    # Expose dataset config for download (if exists)
+    if os.path.isfile(ds_conf):
+        artifacts = [ds_conf]

     # Resolve models_root and set output_dir_base to the unique dataset dir
     models_root = MODELS_ROOT_RUNTIME
@@ -742,7 +746,19 @@ def run_training(
     log_buf += f"[QIE] Running script: {tmp_script}\n"
     out_dir = os.path.join(out_base, output_name.strip())
     ckpts = _list_checkpoints(out_dir)
-    yield (log_buf, ckpts)
+    # Copy the final script to dataset dir for download
+    used_script_path = os.path.join(out_base, "train_QIE_used.sh")
+    try:
+        shutil.copy2(str(tmp_script), used_script_path)
+        try:
+            os.chmod(used_script_path, 0o755)
+        except Exception:
+            pass
+        if used_script_path not in artifacts:
+            artifacts.append(used_script_path)
+    except Exception:
+        pass
+    yield (log_buf, ckpts, artifacts)

     # Run and stream output
     # Ensure child Python processes are unbuffered for real-time logs
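The "run and stream output" step that follows this hunk is not shown in the diff. As context, here is a minimal sketch of how such a streaming step commonly looks: launch the shell script with unbuffered Python children, append each line to the log buffer, and rescan the output directory every N lines so new checkpoints appear live. Function and variable names are assumptions, not the Space's implementation:

    import os
    import subprocess
    from typing import List

    def stream_script(script_path: str, out_dir: str):
        # Force unbuffered output from child Python processes so logs stream in real time.
        env = dict(os.environ, PYTHONUNBUFFERED="1")
        proc = subprocess.Popen(
            ["bash", script_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            env=env,
        )
        log_buf = ""
        ckpts: List[str] = []
        i = 0
        try:
            for line in proc.stdout:
                log_buf += line
                i += 1
                if i % 30 == 0:  # rescan the output dir periodically
                    if os.path.isdir(out_dir):
                        ckpts = sorted(
                            os.path.join(out_dir, f) for f in os.listdir(out_dir)
                        )
                    yield (log_buf, ckpts)
        finally:
            code = proc.wait()
            log_buf += f"exit code: {code}\n"
        yield (log_buf, ckpts)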
@@ -767,7 +783,11 @@ def run_training(
             i += 1
             if i % 30 == 0:
                 ckpts = _list_checkpoints(out_dir)
-                yield (log_buf, ckpts)
+                # Try to add metadata.jsonl once available
+                metadata_json = os.path.join(out_base, "metadata.jsonl")
+                if os.path.isfile(metadata_json) and metadata_json not in artifacts:
+                    artifacts.append(metadata_json)
+                yield (log_buf, ckpts, artifacts)
     finally:
         code = proc.wait()
         # Try to locate latest LoRA file for download
@@ -778,7 +798,11 @@ def run_training(
             pass
         lora_path = ckpts[0] if ckpts else None
         log_buf += f"[QIE] Exit code: {code}\n"
-        yield (log_buf, ckpts)
+        # Final attempt to include metadata.jsonl
+        metadata_json = os.path.join(out_base, "metadata.jsonl")
+        if os.path.isfile(metadata_json) and metadata_json not in artifacts:
+            artifacts.append(metadata_json)
+        yield (log_buf, ckpts, artifacts)


 def build_ui() -> gr.Blocks:
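The helper `_list_checkpoints` is referenced throughout but not shown in this diff, and `lora_path = ckpts[0] if ckpts else None` only picks the latest LoRA if the list is ordered newest-first. A hypothetical implementation consistent with that usage (an assumption, not the Space's code) could look like:

    import glob
    import os
    from typing import List

    def _list_checkpoints(out_dir: str) -> List[str]:
        """Hypothetical sketch: return .safetensors files newest-first so that
        index 0 is the most recent checkpoint."""
        if not os.path.isdir(out_dir):
            return []
        paths = glob.glob(os.path.join(out_dir, "*.safetensors"))
        return sorted(paths, key=os.path.getmtime, reverse=True)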
@@ -934,6 +958,7 @@ def build_ui() -> gr.Blocks:
                 run_btn = gr.Button("Start Training", variant="primary")
                 logs = gr.Textbox(label="Logs", lines=20)
                 ckpt_files = gr.Files(label="Checkpoints (live)", interactive=False)
+                scripts_files = gr.Files(label="Scripts & Config (live)", interactive=False)

                 # moved max_epochs/save_every above next to OUTPUT NAME

@@ -964,7 +989,7 @@ def build_ui() -> gr.Blocks:
                     tr_w, tr_h, train_bs, cr_w, cr_h, te_bs,
                     seed_input, max_epochs, save_every,
                 ],
-                outputs=[logs, ckpt_files],
+                outputs=[logs, ckpt_files, scripts_files],
             )

             with gr.TabItem("Prompt Generator"):
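The metadata.jsonl bookkeeping is written twice in this change, once in the streaming loop and once after the exit code is logged. A small helper along the following lines could centralize the check-and-append pattern; this is a hypothetical sketch, not code from the Space:

    import os
    from typing import List

    def add_artifact_if_present(path: str, artifacts: List[str]) -> None:
        """Hypothetical helper: expose a file for download exactly once,
        and only after it actually exists on disk."""
        if os.path.isfile(path) and path not in artifacts:
            artifacts.append(path)

    # Usage at both call sites that currently repeat the check:
    # add_artifact_if_present(os.path.join(out_base, "metadata.jsonl"), artifacts)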