ameerazam08 commited on
Commit
6a6edcb
1 Parent(s): 5d25eca

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. .gitignore +12 -0
  3. LICENSE +21 -0
  4. README.md +4 -4
  5. WEIGHTS_LICENSE +44 -0
  6. __init__.py +0 -0
  7. app.py +35 -0
  8. configs/inference/controlnet_c_3b_canny.yaml +14 -0
  9. configs/inference/controlnet_c_3b_identity.yaml +17 -0
  10. configs/inference/controlnet_c_3b_inpainting.yaml +15 -0
  11. configs/inference/controlnet_c_3b_sr.yaml +15 -0
  12. configs/inference/lora_c_3b.yaml +15 -0
  13. configs/inference/stage_b_3b.yaml +13 -0
  14. configs/inference/stage_c_3b.yaml +7 -0
  15. configs/training/controlnet_c_3b_canny.yaml +45 -0
  16. configs/training/controlnet_c_3b_identity.yaml +48 -0
  17. configs/training/controlnet_c_3b_inpainting.yaml +46 -0
  18. configs/training/controlnet_c_3b_sr.yaml +46 -0
  19. configs/training/finetune_b_3b.yaml +36 -0
  20. configs/training/finetune_b_700m.yaml +36 -0
  21. configs/training/finetune_c_1b.yaml +35 -0
  22. configs/training/finetune_c_3b.yaml +35 -0
  23. configs/training/finetune_c_3b_lora.yaml +44 -0
  24. configs/training/finetune_c_3b_lowres.yaml +41 -0
  25. configs/training/finetune_c_3b_v.yaml +36 -0
  26. core/__init__.py +371 -0
  27. core/data/__init__.py +69 -0
  28. core/data/bucketeer.py +72 -0
  29. core/scripts/__init__.py +0 -0
  30. core/scripts/cli.py +41 -0
  31. core/templates/__init__.py +1 -0
  32. core/templates/diffusion.py +236 -0
  33. core/utils/__init__.py +9 -0
  34. core/utils/base_dto.py +56 -0
  35. core/utils/save_and_load.py +59 -0
  36. figures/collage_1.jpg +3 -0
  37. figures/collage_2.jpg +0 -0
  38. figures/collage_3.jpg +3 -0
  39. figures/collage_4.jpg +0 -0
  40. figures/comparison-inference-speed.jpg +0 -0
  41. figures/comparison.png +0 -0
  42. figures/controlnet-canny.jpg +0 -0
  43. figures/controlnet-face.jpg +0 -0
  44. figures/controlnet-paint.jpg +0 -0
  45. figures/controlnet-sr.jpg +3 -0
  46. figures/fernando.jpg +0 -0
  47. figures/fernando_original.jpg +0 -0
  48. figures/image-to-image-example-rodent.jpg +0 -0
  49. figures/image-variations-example-headset.jpg +0 -0
  50. figures/model-overview.jpg +0 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ figures/collage_1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ figures/collage_3.jpg filter=lfs diff=lfs merge=lfs -text
38
+ figures/controlnet-sr.jpg filter=lfs diff=lfs merge=lfs -text
39
+ inference/controlnet.ipynb filter=lfs diff=lfs merge=lfs -text
40
+ inference/reconstruct_images.ipynb filter=lfs diff=lfs merge=lfs -text
41
+ inference/text_to_image.ipynb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.yml
2
+ *.out
3
+ dist_file_*
4
+ __pycache__/*
5
+ */__pycache__/*
6
+ */**/__pycache__/*
7
+ *_latest_output.jpg
8
+ *_sample.jpg
9
+ jobs/*.sh
10
+ .ipynb_checkpoints
11
+ *.safetensors
12
+ *_test.yaml
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Stability AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Stable Cascade SR
3
- emoji: 🌖
4
- colorFrom: red
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 4.19.1
8
  app_file: app.py
 
1
  ---
2
+ title: Stable Cascade Upscale
3
+ emoji: 🏃
4
+ colorFrom: pink
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 4.19.1
8
  app_file: app.py
WEIGHTS_LICENSE ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## THIS LICENSE IS FOR THE MODEL WEIGHTS ONLY
2
+
3
+ STABILITY AI NON-COMMERCIAL RESEARCH COMMUNITY LICENSE AGREEMENT
4
+ Dated: November 28, 2023
5
+
6
+ By using or distributing any portion or element of the Models, Software, Software Products or Derivative Works, you agree to be bound by this Agreement.
7
+
8
+ "Agreement" means this Stable Non-Commercial Research Community License Agreement.
9
+
10
+ “AUP” means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time.
11
+
12
+ "Derivative Work(s)” means (a) any derivative work of the Software Products as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output. For clarity, Derivative Works do not include the output of any Model.
13
+
14
+ “Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software.
15
+
16
+ "Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
17
+
18
+ “Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing, made available under this Agreement.
19
+
20
+ “Non-Commercial Uses” means exercising any of the rights granted herein for the purpose of research or non-commercial purposes. Non-Commercial Uses does not include any production use of the Software Products or any Derivative Works.
21
+
22
+ "Stability AI" or "we" means Stability AI Ltd. and its affiliates.
23
+
24
+
25
+ "Software" means Stability AI’s proprietary software made available under this Agreement.
26
+
27
+ “Software Products” means the Models, Software and Documentation, individually or in any combination.
28
+
29
+
30
+
31
+ 1. License Rights and Redistribution.
32
+ a. Subject to your compliance with this Agreement, the AUP (which is hereby incorporated herein by reference), and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned or controlled by Stability AI embodied in the Software Products to use, reproduce, distribute, and create Derivative Works of, the Software Products, in each case for Non-Commercial Uses only.
33
+ b. You may not use the Software Products or Derivative Works to enable third parties to use the Software Products or Derivative Works as part of your hosted service or via your APIs, whether you are adding substantial additional functionality thereto or not. Merely distributing the Software Products or Derivative Works for download online without offering any related service (ex. by distributing the Models on HuggingFace) is not a violation of this subsection. If you wish to use the Software Products or any Derivative Works for commercial or production use or you wish to make the Software Products or any Derivative Works available to third parties via your hosted service or your APIs, contact Stability AI at https://stability.ai/contact.
34
+ c. If you distribute or make the Software Products, or any Derivative Works thereof, available to a third party, the Software Products, Derivative Works, or any portion thereof, respectively, will remain subject to this Agreement and you must (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Non-Commercial Research Community License, Copyright (c) Stability AI Ltd. All Rights Reserved.” If you create a Derivative Work of a Software Product, you may add your own attribution notices to the Notice file included with the Software Product, provided that you clearly indicate which attributions apply to the Software Product and you must state in the NOTICE file that you changed the Software Product and how it was modified.
35
+ 2. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.
36
+ 3. Limitation of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
37
+ 4. Intellectual Property.
38
+ a. No trademark licenses are granted under this Agreement, and in connection with the Software Products or Derivative Works, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products or Derivative Works.
39
+ b. Subject to Stability AI’s ownership of the Software Products and Derivative Works made by or for Stability AI, with respect to any Derivative Works that are made by you, as between you and Stability AI, you are and will be the owner of such Derivative Works
40
+ c. If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products or Derivative Works in violation of this Agreement.
41
+ 5. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of any Software Products or Derivative Works. Sections 2-4 shall survive the termination of this Agreement.
42
+
43
+ 6. Governing Law. This Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law
44
+ principles.
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image
3
+ from main import Upscale_CaseCade
4
+ import spaces
5
+
6
+ upscale_class=Upscale_CaseCade()
7
+ # scale_factor=7
8
+ # url = "https://cdn.discordapp.com/attachments/1121232062708457508/1205110687538479145/A_photograph_of_a_sunflower_with_sunglasses_on_in__3.jpg?ex=65d72dc9&is=65c4b8c9&hm=72172e774ce6cda618503b3778b844de05cd1208b61e185d8418db512fb2858a&"
9
+ # image_pil=Image.open("/home/rnd/Documents/Ameer/StableCascade/poster.png").convert("RGB")
10
+ @spaces.GPU
11
+ def scale_image(image_pil,scale_factor):
12
+ og,ups=upscale_class.upscale_image(image_pil=image_pil.convert("RGB"),scale_fator=scale_factor)
13
+ return [ups]
14
+ DESCRIPTION = "# Stable Cascade -> Super Resolution"
15
+ DESCRIPTION += "\n<p style=\"text-align: center\">Unofficial demo for Cascade-Super Resolution <a href='https://huggingface.co/stabilityai/stable-cascade' target='_blank'>Stable Upscale Cascade</a>, a new high resolution image-to-image model by Stability AI, - <a href='https://huggingface.co/stabilityai/stable-cascade/blob/main/LICENSE' target='_blank'>non-commercial research license</a></p>"
16
+ # block = gr.Blocks(css="footer {visibility: hidden}", theme='freddyaboulton/dracula_revamped').queue()
17
+ block = gr.Blocks(css="footer {visibility: hidden}", theme='freddyaboulton/dark').queue()
18
+
19
+ with block:
20
+ with gr.Row():
21
+ gr.Markdown(DESCRIPTION)
22
+ with gr.Tabs():
23
+ with gr.Row():
24
+ with gr.Column():
25
+ image_pil = gr.Image(label="Describe the Image", type='pil')
26
+ scale_factor = gr.Slider(minimum=1,maximum=10,value=1, step=1, label="Scale Factor")
27
+ generate_button = gr.Button("Upscale Image")
28
+ with gr.Column():
29
+ generated_image = gr.Gallery(label="Generated Image",)
30
+
31
+ generate_button.click(fn=scale_image, inputs=[image_pil,scale_factor], outputs=[generated_image])
32
+
33
+ block.launch(show_api=False, server_port=8888, share=False, show_error=True, max_threads=1)
34
+
35
+ # pip install gradio==4.16.0 gradio_client==0.8.1
configs/inference/controlnet_c_3b_canny.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ model_version: 3.6B
3
+ dtype: bfloat16
4
+
5
+ # ControlNet specific
6
+ controlnet_blocks: [0, 4, 8, 12, 51, 55, 59, 63]
7
+ controlnet_filter: CannyFilter
8
+ controlnet_filter_params:
9
+ resize: 224
10
+
11
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
12
+ previewer_checkpoint_path: models/previewer.safetensors
13
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
14
+ controlnet_checkpoint_path: models/canny.safetensors
configs/inference/controlnet_c_3b_identity.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ model_version: 3.6B
3
+ dtype: bfloat16
4
+
5
+ # ControlNet specific
6
+ controlnet_bottleneck_mode: 'simple'
7
+ controlnet_blocks: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]
8
+ controlnet_filter: IdentityFilter
9
+ controlnet_filter_params:
10
+ max_faces: 4
11
+ p_drop: 0.00
12
+ p_full: 0.0
13
+
14
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
15
+ previewer_checkpoint_path: models/previewer.safetensors
16
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
17
+ controlnet_checkpoint_path:
configs/inference/controlnet_c_3b_inpainting.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ model_version: 3.6B
3
+ dtype: bfloat16
4
+
5
+ # ControlNet specific
6
+ controlnet_blocks: [0, 4, 8, 12, 51, 55, 59, 63]
7
+ controlnet_filter: InpaintFilter
8
+ controlnet_filter_params:
9
+ thresold: [0.04, 0.4]
10
+ p_outpaint: 0.4
11
+
12
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
13
+ previewer_checkpoint_path: models/previewer.safetensors
14
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
15
+ controlnet_checkpoint_path: models/inpainting.safetensors
configs/inference/controlnet_c_3b_sr.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ model_version: 3.6B
3
+ dtype: bfloat16
4
+
5
+ # ControlNet specific
6
+ controlnet_bottleneck_mode: 'large'
7
+ controlnet_blocks: [0, 4, 8, 12, 51, 55, 59, 63]
8
+ controlnet_filter: SREffnetFilter
9
+ controlnet_filter_params:
10
+ scale_factor: 0.5
11
+
12
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
13
+ previewer_checkpoint_path: models/previewer.safetensors
14
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
15
+ controlnet_checkpoint_path: models/super_resolution.safetensors
configs/inference/lora_c_3b.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ model_version: 3.6B
3
+ dtype: bfloat16
4
+
5
+ # LoRA specific
6
+ module_filters: ['.attn']
7
+ rank: 4
8
+ train_tokens:
9
+ # - ['^snail', null] # token starts with "snail" -> "snail" & "snails", don't need to be reinitialized
10
+ - ['[fernando]', '^dog</w>'] # custom token [fernando], initialized as the embedding of "dog"
11
+
12
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
13
+ previewer_checkpoint_path: models/previewer.safetensors
14
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
15
+ lora_checkpoint_path: models/lora_fernando_10k.safetensors
configs/inference/stage_b_3b.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ model_version: 3B
3
+ dtype: bfloat16
4
+
5
+ # For demonstration purposes in reconstruct_images.ipynb
6
+ webdataset_path: file:inference/imagenet_1024.tar
7
+ batch_size: 4
8
+ image_size: 1024
9
+ grad_accum_steps: 1
10
+
11
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
12
+ stage_a_checkpoint_path: models/stage_a.safetensors
13
+ generator_checkpoint_path: models/stage_b_bf16.safetensors
configs/inference/stage_c_3b.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ model_version: 3.6B
3
+ dtype: bfloat16
4
+
5
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
6
+ previewer_checkpoint_path: models/previewer.safetensors
7
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/controlnet_c_3b_canny.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_controlnet_canny
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 256
14
+ image_size: 768
15
+ # multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 10000
18
+ backup_every: 2000
19
+ save_every: 1000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # ControlNet specific
24
+ controlnet_blocks: [0, 4, 8, 12, 51, 55, 59, 63]
25
+ controlnet_filter: CannyFilter
26
+ controlnet_filter_params:
27
+ resize: 224
28
+ # offset_noise: 0.1
29
+
30
+ # CUSTOM CAPTIONS GETTER & FILTERS
31
+ captions_getter: ['txt', identity]
32
+ dataset_filters:
33
+ - ['width', 'lambda w: w >= 768']
34
+ - ['height', 'lambda h: h >= 768']
35
+
36
+ # ema_start_iters: 5000
37
+ # ema_iters: 100
38
+ # ema_beta: 0.9
39
+
40
+ webdataset_path:
41
+ - s3://path/to/your/first/dataset/on/s3
42
+ - s3://path/to/your/second/dataset/on/s3
43
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
44
+ previewer_checkpoint_path: models/previewer.safetensors
45
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/controlnet_c_3b_identity.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_controlnet_identity
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 256
14
+ image_size: 768
15
+ # multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 200000
18
+ backup_every: 2000
19
+ save_every: 1000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # ControlNet specific
24
+ controlnet_bottleneck_mode: 'simple'
25
+ controlnet_blocks: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]
26
+ controlnet_filter: IdentityFilter
27
+ controlnet_filter_params:
28
+ max_faces: 4
29
+ p_drop: 0.05
30
+ p_full: 0.3
31
+ # offset_noise: 0.1
32
+
33
+ # CUSTOM CAPTIONS GETTER & FILTERS
34
+ captions_getter: ['txt', identity]
35
+ dataset_filters:
36
+ - ['width', 'lambda w: w >= 768']
37
+ - ['height', 'lambda h: h >= 768']
38
+
39
+ # ema_start_iters: 5000
40
+ # ema_iters: 100
41
+ # ema_beta: 0.9
42
+
43
+ webdataset_path:
44
+ - s3://path/to/your/first/dataset/on/s3
45
+ - s3://path/to/your/second/dataset/on/s3
46
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
47
+ previewer_checkpoint_path: models/previewer.safetensors
48
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/controlnet_c_3b_inpainting.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_controlnet_inpainting
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 256
14
+ image_size: 768
15
+ # multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 10000
18
+ backup_every: 2000
19
+ save_every: 1000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # ControlNet specific
24
+ controlnet_blocks: [0, 4, 8, 12, 51, 55, 59, 63]
25
+ controlnet_filter: InpaintFilter
26
+ controlnet_filter_params:
27
+ thresold: [0.04, 0.4]
28
+ p_outpaint: 0.4
29
+ offset_noise: 0.1
30
+
31
+ # CUSTOM CAPTIONS GETTER & FILTERS
32
+ captions_getter: ['txt', identity]
33
+ dataset_filters:
34
+ - ['width', 'lambda w: w >= 768']
35
+ - ['height', 'lambda h: h >= 768']
36
+
37
+ # ema_start_iters: 5000
38
+ # ema_iters: 100
39
+ # ema_beta: 0.9
40
+
41
+ webdataset_path:
42
+ - s3://path/to/your/first/dataset/on/s3
43
+ - s3://path/to/your/second/dataset/on/s3
44
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
45
+ previewer_checkpoint_path: models/previewer.safetensors
46
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/controlnet_c_3b_sr.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_controlnet_sr
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 256
14
+ image_size: 768
15
+ # multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 30000
18
+ backup_every: 5000
19
+ save_every: 1000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # ControlNet specific
24
+ controlnet_bottleneck_mode: 'large'
25
+ controlnet_blocks: [0, 4, 8, 12, 51, 55, 59, 63]
26
+ controlnet_filter: SREffnetFilter
27
+ controlnet_filter_params:
28
+ scale_factor: 0.5
29
+ offset_noise: 0.1
30
+
31
+ # CUSTOM CAPTIONS GETTER & FILTERS
32
+ captions_getter: ['txt', identity]
33
+ dataset_filters:
34
+ - ['width', 'lambda w: w >= 768']
35
+ - ['height', 'lambda h: h >= 768']
36
+
37
+ # ema_start_iters: 5000
38
+ # ema_iters: 100
39
+ # ema_beta: 0.9
40
+
41
+ webdataset_path:
42
+ - s3://path/to/your/first/dataset/on/s3
43
+ - s3://path/to/your/second/dataset/on/s3
44
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
45
+ previewer_checkpoint_path: models/previewer.safetensors
46
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/finetune_b_3b.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_b_3b_finetuning
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 256
14
+ image_size: 1024
15
+ # multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ shift: 4
17
+ grad_accum_steps: 1
18
+ updates: 100000
19
+ backup_every: 20000
20
+ save_every: 1000
21
+ warmup_updates: 1
22
+ use_fsdp: True
23
+
24
+ # GDF
25
+ adaptive_loss_weight: True
26
+
27
+ # ema_start_iters: 5000
28
+ # ema_iters: 100
29
+ # ema_beta: 0.9
30
+
31
+ webdataset_path:
32
+ - s3://path/to/your/first/dataset/on/s3
33
+ - s3://path/to/your/second/dataset/on/s3
34
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
35
+ stage_a_checkpoint_path: models/stage_a.safetensors
36
+ generator_checkpoint_path: models/stage_b_bf16.safetensors
configs/training/finetune_b_700m.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_b_700m_finetuning
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 700M
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 512
14
+ image_size: 1024
15
+ # multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ shift: 4
17
+ grad_accum_steps: 1
18
+ updates: 10000
19
+ backup_every: 20000
20
+ save_every: 2000
21
+ warmup_updates: 1
22
+ use_fsdp: True
23
+
24
+ # GDF
25
+ adaptive_loss_weight: True
26
+
27
+ # ema_start_iters: 5000
28
+ # ema_iters: 100
29
+ # ema_beta: 0.9
30
+
31
+ webdataset_path:
32
+ - s3://path/to/your/first/dataset/on/s3
33
+ - s3://path/to/your/second/dataset/on/s3
34
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
35
+ stage_a_checkpoint_path: models/stage_a.safetensors
36
+ generator_checkpoint_path: models/stage_b_lite_bf16.safetensors
configs/training/finetune_c_1b.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_1b_finetuning
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 1B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 1024
14
+ image_size: 768
15
+ # multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 10000
18
+ backup_every: 20000
19
+ save_every: 2000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # GDF
24
+ # adaptive_loss_weight: True
25
+
26
+ # ema_start_iters: 5000
27
+ # ema_iters: 100
28
+ # ema_beta: 0.9
29
+
30
+ webdataset_path:
31
+ - s3://path/to/your/first/dataset/on/s3
32
+ - s3://path/to/your/second/dataset/on/s3
33
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
34
+ previewer_checkpoint_path: models/previewer.safetensors
35
+ generator_checkpoint_path: models/stage_c_lite_bf16.safetensors
configs/training/finetune_c_3b.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_finetuning
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 512
14
+ image_size: 768
15
+ multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 100000
18
+ backup_every: 20000
19
+ save_every: 2000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # GDF
24
+ adaptive_loss_weight: True
25
+
26
+ # ema_start_iters: 5000
27
+ # ema_iters: 100
28
+ # ema_beta: 0.9
29
+
30
+ webdataset_path:
31
+ - s3://path/to/your/first/dataset/on/s3
32
+ - s3://path/to/your/second/dataset/on/s3
33
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
34
+ previewer_checkpoint_path: models/previewer.safetensors
35
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/finetune_c_3b_lora.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_lora
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 32
14
+ image_size: 768
15
+ multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 4
17
+ updates: 10000
18
+ backup_every: 1000
19
+ save_every: 100
20
+ warmup_updates: 1
21
+ # use_fsdp: True -> FSDP doesn't work at the moment for LoRA
22
+ use_fsdp: False
23
+
24
+ # GDF
25
+ # adaptive_loss_weight: True
26
+
27
+ # LoRA specific
28
+ module_filters: ['.attn']
29
+ rank: 4
30
+ train_tokens:
31
+ # - ['^snail', null] # token starts with "snail" -> "snail" & "snails", don't need to be reinitialized
32
+ - ['[fernando]', '^dog</w>'] # custom token [fernando], initialized as the embedding of "dog"
33
+
34
+
35
+ # ema_start_iters: 5000
36
+ # ema_iters: 100
37
+ # ema_beta: 0.9
38
+
39
+ webdataset_path:
40
+ - s3://path/to/your/first/dataset/on/s3
41
+ - s3://path/to/your/second/dataset/on/s3
42
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
43
+ previewer_checkpoint_path: models/previewer.safetensors
44
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/finetune_c_3b_lowres.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_finetuning
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 1024
14
+ image_size: 384
15
+ multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 100000
18
+ backup_every: 20000
19
+ save_every: 2000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # GDF
24
+ adaptive_loss_weight: True
25
+
26
+ # CUSTOM CAPTIONS GETTER & FILTERS
27
+ # captions_getter: ['json', captions_getter]
28
+ # dataset_filters:
29
+ # - ['normalized_score', 'lambda s: s > 9.0']
30
+ # - ['pgen_normalized_score', 'lambda s: s > 3.0']
31
+
32
+ # ema_start_iters: 5000
33
+ # ema_iters: 100
34
+ # ema_beta: 0.9
35
+
36
+ webdataset_path:
37
+ - s3://path/to/your/first/dataset/on/s3
38
+ - s3://path/to/your/second/dataset/on/s3
39
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
40
+ previewer_checkpoint_path: models/previewer.safetensors
41
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
configs/training/finetune_c_3b_v.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLOBAL STUFF
2
+ experiment_id: stage_c_3b_finetuning
3
+ checkpoint_path: /path/to/checkpoint
4
+ output_path: /path/to/output
5
+ model_version: 3.6B
6
+
7
+ # WandB
8
+ wandb_project: StableCascade
9
+ wandb_entity: wandb_username
10
+
11
+ # TRAINING PARAMS
12
+ lr: 1.0e-4
13
+ batch_size: 512
14
+ image_size: 768
15
+ multi_aspect_ratio: [1/1, 1/2, 1/3, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5, 1/6, 5/6, 9/16]
16
+ grad_accum_steps: 1
17
+ updates: 100000
18
+ backup_every: 20000
19
+ save_every: 2000
20
+ warmup_updates: 1
21
+ use_fsdp: True
22
+
23
+ # GDF
24
+ adaptive_loss_weight: True
25
+ edm_objective: True
26
+
27
+ # ema_start_iters: 5000
28
+ # ema_iters: 100
29
+ # ema_beta: 0.9
30
+
31
+ webdataset_path:
32
+ - s3://path/to/your/first/dataset/on/s3
33
+ - s3://path/to/your/second/dataset/on/s3
34
+ effnet_checkpoint_path: models/effnet_encoder.safetensors
35
+ previewer_checkpoint_path: models/previewer.safetensors
36
+ generator_checkpoint_path: models/stage_c_bf16.safetensors
core/__init__.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import torch
4
+ from torch import nn
5
+ import wandb
6
+ import json
7
+ from abc import ABC, abstractmethod
8
+ from dataclasses import dataclass
9
+ from torch.utils.data import Dataset, DataLoader
10
+
11
+ from torch.distributed import init_process_group, destroy_process_group, barrier
12
+ from torch.distributed.fsdp import (
13
+ FullyShardedDataParallel as FSDP,
14
+ FullStateDictConfig,
15
+ MixedPrecision,
16
+ ShardingStrategy,
17
+ StateDictType
18
+ )
19
+
20
+ from .utils import Base, EXPECTED, EXPECTED_TRAIN
21
+ from .utils import create_folder_if_necessary, safe_save, load_or_fail
22
+
23
+ # pylint: disable=unused-argument
24
+ class WarpCore(ABC):
25
+ @dataclass(frozen=True)
26
+ class Config(Base):
27
+ experiment_id: str = EXPECTED_TRAIN
28
+ checkpoint_path: str = EXPECTED_TRAIN
29
+ output_path: str = EXPECTED_TRAIN
30
+ checkpoint_extension: str = "safetensors"
31
+ dist_file_subfolder: str = ""
32
+ allow_tf32: bool = True
33
+
34
+ wandb_project: str = None
35
+ wandb_entity: str = None
36
+
37
+ @dataclass() # not frozen, means that fields are mutable
38
+ class Info(): # not inheriting from Base, because we don't want to enforce the default fields
39
+ wandb_run_id: str = None
40
+ total_steps: int = 0
41
+ iter: int = 0
42
+
43
+ @dataclass(frozen=True)
44
+ class Data(Base):
45
+ dataset: Dataset = EXPECTED
46
+ dataloader: DataLoader = EXPECTED
47
+ iterator: any = EXPECTED
48
+
49
+ @dataclass(frozen=True)
50
+ class Models(Base):
51
+ pass
52
+
53
+ @dataclass(frozen=True)
54
+ class Optimizers(Base):
55
+ pass
56
+
57
+ @dataclass(frozen=True)
58
+ class Schedulers(Base):
59
+ pass
60
+
61
+ @dataclass(frozen=True)
62
+ class Extras(Base):
63
+ pass
64
+ # ---------------------------------------
65
+ info: Info
66
+ config: Config
67
+
68
+ # FSDP stuff
69
+ fsdp_defaults = {
70
+ "sharding_strategy": ShardingStrategy.SHARD_GRAD_OP,
71
+ "cpu_offload": None,
72
+ "mixed_precision": MixedPrecision(
73
+ param_dtype=torch.bfloat16,
74
+ reduce_dtype=torch.bfloat16,
75
+ buffer_dtype=torch.bfloat16,
76
+ ),
77
+ "limit_all_gathers": True,
78
+ }
79
+ fsdp_fullstate_save_policy = FullStateDictConfig(
80
+ offload_to_cpu=True, rank0_only=True
81
+ )
82
+ # ------------
83
+
84
+ # OVERRIDEABLE METHODS
85
+
86
+ # [optionally] setup extra stuff, will be called BEFORE the models & optimizers are setup
87
+ def setup_extras_pre(self) -> Extras:
88
+ return self.Extras()
89
+
90
+ # setup dataset & dataloader, return a dict contained dataser, dataloader and/or iterator
91
+ @abstractmethod
92
+ def setup_data(self, extras: Extras) -> Data:
93
+ raise NotImplementedError("This method needs to be overriden")
94
+
95
+ # return a dict with all models that are going to be used in the training
96
+ @abstractmethod
97
+ def setup_models(self, extras: Extras) -> Models:
98
+ raise NotImplementedError("This method needs to be overriden")
99
+
100
+ # return a dict with all optimizers that are going to be used in the training
101
+ @abstractmethod
102
+ def setup_optimizers(self, extras: Extras, models: Models) -> Optimizers:
103
+ raise NotImplementedError("This method needs to be overriden")
104
+
105
+ # [optionally] return a dict with all schedulers that are going to be used in the training
106
+ def setup_schedulers(self, extras: Extras, models: Models, optimizers: Optimizers) -> Schedulers:
107
+ return self.Schedulers()
108
+
109
+ # [optionally] setup extra stuff, will be called AFTER the models & optimizers are setup
110
+ def setup_extras_post(self, extras: Extras, models: Models, optimizers: Optimizers, schedulers: Schedulers) -> Extras:
111
+ return self.Extras.from_dict(extras.to_dict())
112
+
113
+ # perform the training here
114
+ @abstractmethod
115
+ def train(self, data: Data, extras: Extras, models: Models, optimizers: Optimizers, schedulers: Schedulers):
116
+ raise NotImplementedError("This method needs to be overriden")
117
+ # ------------
118
+
119
    def setup_info(self, full_path=None) -> Info:
        """Load (or initialize) the mutable training-state Info DTO.

        Defaults to `<checkpoint_path>/<experiment_id>/info.json`. A missing or
        unreadable file (load_or_fail returns None) yields a fresh Info with
        default fields, i.e. a run starting from step 0.
        """
        if full_path is None:
            full_path = (f"{self.config.checkpoint_path}/{self.config.experiment_id}/info.json")
        # load_or_fail returns None on failure; fall back to an empty dict
        info_dict = load_or_fail(full_path, wandb_run_id=None) or {}
        info_dto = self.Info(**info_dict)
        if info_dto.total_steps > 0 and self.is_main_node:
            print(">>> RESUMING TRAINING FROM ITER ", info_dto.total_steps)
        return info_dto
127
+
128
+ def setup_config(self, config_file_path=None, config_dict=None, training=True) -> Config:
129
+ if config_file_path is not None:
130
+ if config_file_path.endswith(".yml") or config_file_path.endswith(".yaml"):
131
+ with open(config_file_path, "r", encoding="utf-8") as file:
132
+ loaded_config = yaml.safe_load(file)
133
+ elif config_file_path.endswith(".json"):
134
+ with open(config_file_path, "r", encoding="utf-8") as file:
135
+ loaded_config = json.load(file)
136
+ else:
137
+ raise ValueError("Config file must be either a .yml|.yaml or .json file")
138
+ return self.Config.from_dict({**loaded_config, 'training': training})
139
+ if config_dict is not None:
140
+ return self.Config.from_dict({**config_dict, 'training': training})
141
+ return self.Config(training=training)
142
+
143
    def setup_ddp(self, experiment_id, single_gpu=False):
        """Initialize torch.distributed (NCCL backend, file:// rendezvous).

        Sets self.process_id / is_main_node / device / world_size from the
        launcher environment. With single_gpu=True the single-process defaults
        from __init__ are kept and no process group is created.

        NOTE(review): assumes a SLURM launcher (SLURM_LOCALID / SLURM_PROCID /
        SLURM_NNODES) and that every node exposes the same GPU count — confirm
        before using with torchrun or other launchers.
        """
        if not single_gpu:
            local_rank = int(os.environ.get("SLURM_LOCALID"))
            process_id = int(os.environ.get("SLURM_PROCID"))
            world_size = int(os.environ.get("SLURM_NNODES")) * torch.cuda.device_count()

            self.process_id = process_id
            self.is_main_node = process_id == 0
            self.device = torch.device(local_rank)
            self.world_size = world_size

            # Rendezvous file: all ranks must resolve the same path
            dist_file_path = f"{os.getcwd()}/{self.config.dist_file_subfolder}dist_file_{experiment_id}"
            # if os.path.exists(dist_file_path) and self.is_main_node:
            #     os.remove(dist_file_path)

            torch.cuda.set_device(local_rank)
            init_process_group(
                backend="nccl",
                rank=process_id,
                world_size=world_size,
                init_method=f"file://{dist_file_path}",
            )
            print(f"[GPU {process_id}] READY")
        else:
            print("Running in single thread, DDP not enabled.")
168
+
169
    def setup_wandb(self):
        """Initialize (or resume) the W&B run on the main node; no-op elsewhere.

        The run id is persisted in self.info so a resumed training reattaches
        to the same W&B run; an alert is sent on start/resume.
        """
        if self.is_main_node and self.config.wandb_project is not None:
            # Reuse the stored run id so resumed trainings keep the same W&B run
            self.info.wandb_run_id = self.info.wandb_run_id or wandb.util.generate_id()
            wandb.init(project=self.config.wandb_project, entity=self.config.wandb_entity, name=self.config.experiment_id, id=self.info.wandb_run_id, resume="allow", config=self.config.to_dict())

            if self.info.total_steps > 0:
                wandb.alert(title=f"Training {self.info.wandb_run_id} resumed", text=f"Training {self.info.wandb_run_id} resumed from step {self.info.total_steps}")
            else:
                wandb.alert(title=f"Training {self.info.wandb_run_id} started", text=f"Training {self.info.wandb_run_id} started")
178
+
179
+ # LOAD UTILITIES ----------
180
+ def load_model(self, model, model_id=None, full_path=None, strict=True):
181
+ if model_id is not None and full_path is None:
182
+ full_path = f"{self.config.checkpoint_path}/{self.config.experiment_id}/{model_id}.{self.config.checkpoint_extension}"
183
+ elif full_path is None and model_id is None:
184
+ raise ValueError(
185
+ "This method expects either 'model_id' or 'full_path' to be defined"
186
+ )
187
+
188
+ checkpoint = load_or_fail(full_path, wandb_run_id=self.info.wandb_run_id if self.is_main_node else None)
189
+ if checkpoint is not None:
190
+ model.load_state_dict(checkpoint, strict=strict)
191
+ del checkpoint
192
+
193
+ return model
194
+
195
    def load_optimizer(self, optim, optim_id=None, full_path=None, fsdp_model=None):
        """Load an optimizer state, optionally re-sharding it for FSDP.

        With `fsdp_model` set, the full (rank-0) optimizer state is scattered
        across ranks via FSDP.scatter_full_optim_state_dict. Loading failures
        are logged and skipped so training can proceed with a fresh optimizer.

        Raises:
            ValueError: if neither `optim_id` nor `full_path` is given.
        """
        if optim_id is not None and full_path is None:
            full_path = f"{self.config.checkpoint_path}/{self.config.experiment_id}/{optim_id}.pt"
        elif full_path is None and optim_id is None:
            raise ValueError(
                "This method expects either 'optim_id' or 'full_path' to be defined"
            )

        checkpoint = load_or_fail(full_path, wandb_run_id=self.info.wandb_run_id if self.is_main_node else None)
        if checkpoint is not None:
            try:
                if fsdp_model is not None:
                    # Only rank 0 (or every rank under NO_SHARD) feeds the full
                    # state dict; other ranks pass None and receive their shard.
                    sharded_optimizer_state_dict = (
                        FSDP.scatter_full_optim_state_dict(  # <---- FSDP
                            checkpoint
                            if (
                                self.is_main_node
                                or self.fsdp_defaults["sharding_strategy"]
                                == ShardingStrategy.NO_SHARD
                            )
                            else None,
                            fsdp_model,
                        )
                    )
                    optim.load_state_dict(sharded_optimizer_state_dict)
                    del checkpoint, sharded_optimizer_state_dict
                else:
                    optim.load_state_dict(checkpoint)
            # pylint: disable=broad-except
            except Exception as e:
                # Deliberate best-effort: a stale/incompatible optimizer state
                # should not abort the run.
                print("!!! Failed loading optimizer, skipping... Exception:", e)

        return optim
228
+
229
+ # SAVE UTILITIES ----------
230
    def save_info(self, info, suffix=""):
        """Persist the Info DTO as JSON next to the checkpoints (main node only).

        NOTE(review): the `info` argument is ignored — vars(self.info) is saved;
        confirm whether callers rely on passing a different Info.
        """
        full_path = f"{self.config.checkpoint_path}/{self.config.experiment_id}/info{suffix}.json"
        create_folder_if_necessary(full_path)
        if self.is_main_node:
            safe_save(vars(self.info), full_path)
235
+
236
    def save_model(self, model, model_id=None, full_path=None, is_fsdp=False):
        """Save a model state dict (rank 0 writes; FSDP gathers a full state).

        Raises:
            ValueError: if neither `model_id` nor `full_path` is given.
        """
        if model_id is not None and full_path is None:
            full_path = f"{self.config.checkpoint_path}/{self.config.experiment_id}/{model_id}.{self.config.checkpoint_extension}"
        elif full_path is None and model_id is None:
            raise ValueError(
                "This method expects either 'model_id' or 'full_path' to be defined"
            )
        create_folder_if_necessary(full_path)
        if is_fsdp:
            # NOTE(review): summon_full_params with an empty body appears to be
            # used only for its collective side effect before gathering the
            # state dict — confirm intent.
            with FSDP.summon_full_params(model):
                pass
            # Gather a full, CPU-offloaded state dict on rank 0 only
            with FSDP.state_dict_type(
                model, StateDictType.FULL_STATE_DICT, self.fsdp_fullstate_save_policy
            ):
                checkpoint = model.state_dict()
                if self.is_main_node:
                    safe_save(checkpoint, full_path)
                del checkpoint
        else:
            if self.is_main_node:
                checkpoint = model.state_dict()
                safe_save(checkpoint, full_path)
                del checkpoint
259
+
260
    def save_optimizer(self, optim, optim_id=None, full_path=None, fsdp_model=None):
        """Save an optimizer state (rank 0 writes; FSDP consolidates first).

        Raises:
            ValueError: if neither `optim_id` nor `full_path` is given.
        """
        if optim_id is not None and full_path is None:
            full_path = f"{self.config.checkpoint_path}/{self.config.experiment_id}/{optim_id}.pt"
        elif full_path is None and optim_id is None:
            raise ValueError(
                "This method expects either 'optim_id' or 'full_path' to be defined"
            )
        create_folder_if_necessary(full_path)
        if fsdp_model is not None:
            # Consolidate the sharded optimizer state into a full state dict
            optim_statedict = FSDP.full_optim_state_dict(fsdp_model, optim)
            if self.is_main_node:
                safe_save(optim_statedict, full_path)
            del optim_statedict
        else:
            if self.is_main_node:
                checkpoint = optim.state_dict()
                safe_save(checkpoint, full_path)
                del checkpoint
    # -----
278
+ # -----
279
+
280
    def __init__(self, config_file_path=None, config_dict=None, device="cpu", training=True):
        """Create the core: resolve config and training info; DDP comes later.

        Args:
            config_file_path: optional YAML/JSON config file.
            config_dict: optional config dict (used when no file is given).
            device: initial device; overridden by setup_ddp() in __call__.
            training: whether EXPECTED_TRAIN config fields are mandatory.
        """
        # Temporary setup, will be overriden by setup_ddp if required
        self.device = device
        self.process_id = 0
        self.is_main_node = True
        self.world_size = 1
        # ----

        self.config: self.Config = self.setup_config(config_file_path, config_dict, training)
        self.info: self.Info = self.setup_info()
290
+
291
+ def __call__(self, single_gpu=False):
292
+ self.setup_ddp(self.config.experiment_id, single_gpu=single_gpu) # this will change the device to the CUDA rank
293
+ self.setup_wandb()
294
+ if self.config.allow_tf32:
295
+ torch.backends.cuda.matmul.allow_tf32 = True
296
+ torch.backends.cudnn.allow_tf32 = True
297
+
298
+ if self.is_main_node:
299
+ print()
300
+ print("**STARTIG JOB WITH CONFIG:**")
301
+ print(yaml.dump(self.config.to_dict(), default_flow_style=False))
302
+ print("------------------------------------")
303
+ print()
304
+ print("**INFO:**")
305
+ print(yaml.dump(vars(self.info), default_flow_style=False))
306
+ print("------------------------------------")
307
+ print()
308
+
309
+ # SETUP STUFF
310
+ extras = self.setup_extras_pre()
311
+ assert extras is not None, "setup_extras_pre() must return a DTO"
312
+
313
+ data = self.setup_data(extras)
314
+ assert data is not None, "setup_data() must return a DTO"
315
+ if self.is_main_node:
316
+ print("**DATA:**")
317
+ print(yaml.dump({k:type(v).__name__ for k, v in data.to_dict().items()}, default_flow_style=False))
318
+ print("------------------------------------")
319
+ print()
320
+
321
+ models = self.setup_models(extras)
322
+ assert models is not None, "setup_models() must return a DTO"
323
+ if self.is_main_node:
324
+ print("**MODELS:**")
325
+ print(yaml.dump({
326
+ k:f"{type(v).__name__} - {f'trainable params {sum(p.numel() for p in v.parameters() if p.requires_grad)}' if isinstance(v, nn.Module) else 'Not a nn.Module'}" for k, v in models.to_dict().items()
327
+ }, default_flow_style=False))
328
+ print("------------------------------------")
329
+ print()
330
+
331
+ optimizers = self.setup_optimizers(extras, models)
332
+ assert optimizers is not None, "setup_optimizers() must return a DTO"
333
+ if self.is_main_node:
334
+ print("**OPTIMIZERS:**")
335
+ print(yaml.dump({k:type(v).__name__ for k, v in optimizers.to_dict().items()}, default_flow_style=False))
336
+ print("------------------------------------")
337
+ print()
338
+
339
+ schedulers = self.setup_schedulers(extras, models, optimizers)
340
+ assert schedulers is not None, "setup_schedulers() must return a DTO"
341
+ if self.is_main_node:
342
+ print("**SCHEDULERS:**")
343
+ print(yaml.dump({k:type(v).__name__ for k, v in schedulers.to_dict().items()}, default_flow_style=False))
344
+ print("------------------------------------")
345
+ print()
346
+
347
+ post_extras =self.setup_extras_post(extras, models, optimizers, schedulers)
348
+ assert post_extras is not None, "setup_extras_post() must return a DTO"
349
+ extras = self.Extras.from_dict({ **extras.to_dict(),**post_extras.to_dict() })
350
+ if self.is_main_node:
351
+ print("**EXTRAS:**")
352
+ print(yaml.dump({k:f"{v}" for k, v in extras.to_dict().items()}, default_flow_style=False))
353
+ print("------------------------------------")
354
+ print()
355
+ # -------
356
+
357
+ # TRAIN
358
+ if self.is_main_node:
359
+ print("**TRAINING STARTING...**")
360
+ self.train(data, extras, models, optimizers, schedulers)
361
+
362
+ if single_gpu is False:
363
+ barrier()
364
+ destroy_process_group()
365
+ if self.is_main_node:
366
+ print()
367
+ print("------------------------------------")
368
+ print()
369
+ print("**TRAINING COMPLETE**")
370
+ if self.config.wandb_project is not None:
371
+ wandb.alert(title=f"Training {self.info.wandb_run_id} finished", text=f"Training {self.info.wandb_run_id} finished")
core/data/__init__.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import subprocess
3
+ import yaml
4
+ import os
5
+ from .bucketeer import Bucketeer
6
+
7
class MultiFilter():
    """Callable sample filter driven by rules over a sample's 'json' metadata.

    `rules` maps a json key (or a tuple of keys) to a predicate over the
    corresponding value(s); the sample passes only if every predicate is truthy.
    `default` is returned for samples that cannot be evaluated (missing keys,
    undecodable json, predicate errors).
    """
    def __init__(self, rules, default=False):
        self.rules = rules
        self.default = default

    def __call__(self, x):
        try:
            x_json = x['json']
            if isinstance(x_json, bytes):
                x_json = json.loads(x_json)
            validations = []
            for k, r in self.rules.items():
                if isinstance(k, tuple):
                    v = r(*[x_json[kv] for kv in k])
                else:
                    v = r(x_json[k])
                validations.append(v)
            return all(validations)
        except Exception:
            # BUGFIX: previously returned a hard-coded False, silently ignoring
            # the configurable `default` fallback for unevaluable samples.
            return self.default
27
+
28
class MultiGetter():
    """Callable that derives values from json metadata via rules.

    `rules` maps a key (or a tuple of keys) to a function of the corresponding
    value(s). Returns a single value when there is exactly one rule, otherwise
    a list in rule order.
    """
    def __init__(self, rules):
        self.rules = rules

    def __call__(self, x_json):
        if isinstance(x_json, bytes):
            x_json = json.loads(x_json)
        results = [
            rule(*(x_json[key] for key in keys)) if isinstance(keys, tuple) else rule(x_json[keys])
            for keys, rule in self.rules.items()
        ]
        return results[0] if len(results) == 1 else results
45
+
46
def setup_webdataset_path(paths, cache_path=None):
    """Resolve dataset path(s) into a webdataset "pipe:aws s3 cp ..." command.

    Args:
        paths: str or list of str; each entry is either a direct .tar path or
            an s3 prefix that is recursively listed for .tar shards.
        cache_path: optional YAML file caching the resolved tar list; read if
            it exists, written after listing when provided.

    Returns:
        A command string of the form "pipe:aws s3 cp { p1,p2,... } -".
    """
    if cache_path is not None and os.path.exists(cache_path):
        import yaml  # local import: only needed for the cache file
        with open(cache_path, 'r', encoding='utf-8') as file:
            tar_paths = yaml.safe_load(file)
    else:
        tar_paths = []
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            if path.strip().endswith(".tar"):
                # Avoid looking up s3 if we already have a tar file
                tar_paths.append(path)
                continue
            # bucket = "s3://<bucket-name>" (first three '/'-separated parts)
            bucket = "/".join(path.split("/")[:3])
            result = subprocess.run([f"aws s3 ls {path} --recursive | awk '{{print $4}}'"], stdout=subprocess.PIPE, shell=True, check=True)
            files = result.stdout.decode('utf-8').split()
            files = [f"{bucket}/{f}" for f in files if f.endswith(".tar")]
            tar_paths += files

        # BUGFIX: only write the cache when a cache path was provided;
        # previously open(None, 'w') raised TypeError for cache_path=None.
        if cache_path is not None:
            import yaml
            with open(cache_path, 'w', encoding='utf-8') as outfile:
                yaml.dump(tar_paths, outfile, default_flow_style=False)

    tar_paths_str = ",".join([f"{p}" for p in tar_paths])
    return f"pipe:aws s3 cp {{ {tar_paths_str} }} -"
core/data/bucketeer.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchvision
3
+ import numpy as np
4
+ from torchtools.transforms import SmartCrop
5
+ import math
6
+
7
class Bucketeer():
    """Aspect-ratio bucketing wrapper around a dataloader.

    Groups samples by their closest target aspect ratio at (roughly) constant
    pixel `density`, resizes + crops each image to the bucket's size, and
    yields homogeneous batches via __next__.

    NOTE(review): assumes each element from the wrapped dataloader is a dict
    with an 'images' tensor laid out (..., H, W) — confirm against the caller.
    """
    def __init__(self, dataloader, density=256*256, factor=8, ratios=[1/1, 1/2, 3/4, 3/5, 4/5, 6/9, 9/16], reverse_list=True, randomize_p=0.3, randomize_q=0.2, crop_mode='random', p_random_ratio=0.0, interpolate_nearest=False):
        assert crop_mode in ['center', 'random', 'smart']
        self.crop_mode = crop_mode
        self.ratios = ratios
        if reverse_list:
            # Add the inverse of each ratio (portrait <-> landscape); note this
            # appends to the same list object, so `ratios` below includes them.
            for r in list(ratios):
                if 1/r not in self.ratios:
                    self.ratios.append(1/r)
        # Bucket sizes: (h, w) rounded down to multiples of `factor`, chosen so
        # h*w stays close to `density` for each ratio.
        self.sizes = [(int(((density/r)**0.5//factor)*factor), int(((density*r)**0.5//factor)*factor)) for r in ratios]
        self.batch_size = dataloader.batch_size
        self.iterator = iter(dataloader)
        self.buckets = {s: [] for s in self.sizes}
        self.smartcrop = SmartCrop(int(density**0.5), randomize_p, randomize_q) if self.crop_mode=='smart' else None
        self.p_random_ratio = p_random_ratio
        self.interpolate_nearest = interpolate_nearest

    def get_available_batch(self):
        """Pop and return the first bucket holding a full batch, else None."""
        for b in self.buckets:
            if len(self.buckets[b]) >= self.batch_size:
                batch = self.buckets[b][:self.batch_size]
                self.buckets[b] = self.buckets[b][self.batch_size:]
                return batch
        return None

    def get_closest_size(self, x):
        """Return the bucket size whose aspect ratio best matches image `x`.

        With probability `p_random_ratio`, a random bucket is picked instead
        (data augmentation / bucket balancing).
        """
        if self.p_random_ratio > 0 and np.random.rand() < self.p_random_ratio:
            best_size_idx = np.random.randint(len(self.ratios))
        else:
            w, h = x.size(-1), x.size(-2)
            best_size_idx = np.argmin([abs(w/h-r) for r in self.ratios])
        return self.sizes[best_size_idx]

    def get_resize_size(self, orig_size, tgt_size):
        """Compute the shorter-edge resize length so the image covers tgt_size.

        The first branch handles the case where original and target have the
        same orientation (both portrait or both landscape); the second handles
        mismatched orientations.
        """
        if (tgt_size[1]/tgt_size[0] - 1) * (orig_size[1]/orig_size[0] - 1) >= 0:
            alt_min = int(math.ceil(max(tgt_size)*min(orig_size)/max(orig_size)))
            resize_size = max(alt_min, min(tgt_size))
        else:
            alt_max = int(math.ceil(min(tgt_size)*max(orig_size)/min(orig_size)))
            resize_size = max(alt_max, max(tgt_size))
        return resize_size

    def __next__(self):
        """Fill buckets from the wrapped iterator until one batch is complete,
        then return it as a dict of stacked tensors (non-tensors stay lists).
        """
        batch = self.get_available_batch()
        while batch is None:
            elements = next(self.iterator)
            for dct in elements:
                img = dct['images']
                size = self.get_closest_size(img)
                resize_size = self.get_resize_size(img.shape[-2:], size)
                if self.interpolate_nearest:
                    img = torchvision.transforms.functional.resize(img, resize_size, interpolation=torchvision.transforms.InterpolationMode.NEAREST)
                else:
                    img = torchvision.transforms.functional.resize(img, resize_size, interpolation=torchvision.transforms.InterpolationMode.BILINEAR, antialias=True)
                if self.crop_mode == 'center':
                    img = torchvision.transforms.functional.center_crop(img, size)
                elif self.crop_mode == 'random':
                    img = torchvision.transforms.RandomCrop(size)(img)
                elif self.crop_mode == 'smart':
                    self.smartcrop.output_size = size
                    img = self.smartcrop(img)
                self.buckets[size].append({**{'images': img}, **{k:dct[k] for k in dct if k != 'images'}})
            batch = self.get_available_batch()

        out = {k:[batch[i][k] for i in range(len(batch))] for k in batch[0]}
        return {k: torch.stack(o, dim=0) if isinstance(o[0], torch.Tensor) else o for k, o in out.items()}
core/scripts/__init__.py ADDED
File without changes
core/scripts/cli.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import argparse
3
+ from .. import WarpCore
4
+ from .. import templates
5
+
6
+
7
def template_init(args):
    """Return the source-code template for a new project (currently empty).

    `args` is accepted for interface symmetry with the other CLI handlers but
    is not used.
    """
    # BUGFIX: the literal previously opened with four quotes (''''), which put
    # a stray apostrophe at the start of the emitted template.
    return '''


'''.strip()
12
+
13
+
14
def init_template(args):
    """Handle the `init` subcommand: resolve the template class and print it.

    Resolution order: the built-in WarpCore, then an importable module of that
    name, then an attribute of the bundled `templates` package.
    """
    parser = argparse.ArgumentParser(description='WarpCore template init tool')
    parser.add_argument('-t', '--template', type=str, default='WarpCore')
    parsed = parser.parse_args(args)

    if parsed.template == 'WarpCore':
        template_cls = WarpCore
    else:
        try:
            template_cls = __import__(parsed.template)
        except ModuleNotFoundError:
            template_cls = getattr(templates, parsed.template)
    print(template_cls)
27
+
28
+
29
def main():
    """CLI entry point: dispatch `core <command>`; exit 1 on bad usage."""
    argv = sys.argv
    if len(argv) < 2:
        print('Usage: core <command>')
        sys.exit(1)
    command = argv[1]
    if command == 'init':
        init_template(argv[2:])
    else:
        print('Unknown command')
        sys.exit(1)
38
+
39
+
40
+ if __name__ == '__main__':
41
+ main()
core/templates/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .diffusion import DiffusionCore
core/templates/diffusion.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .. import WarpCore
2
+ from ..utils import EXPECTED, EXPECTED_TRAIN, update_weights_ema, create_folder_if_necessary
3
+ from abc import abstractmethod
4
+ from dataclasses import dataclass
5
+ import torch
6
+ from torch import nn
7
+ from torch.utils.data import DataLoader
8
+ from gdf import GDF
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+ import wandb
12
+
13
+ import webdataset as wds
14
+ from webdataset.handlers import warn_and_continue
15
+ from torch.distributed import barrier
16
+ from enum import Enum
17
+
18
class TargetReparametrization(Enum):
    """Which quantity the diffusion loss target is reparametrized to."""
    EPSILON = 'epsilon'  # loss against the added noise
    X0 = 'x0'            # loss against the clean latents
21
+
22
+ class DiffusionCore(WarpCore):
23
+ @dataclass(frozen=True)
24
+ class Config(WarpCore.Config):
25
+ # TRAINING PARAMS
26
+ lr: float = EXPECTED_TRAIN
27
+ grad_accum_steps: int = EXPECTED_TRAIN
28
+ batch_size: int = EXPECTED_TRAIN
29
+ updates: int = EXPECTED_TRAIN
30
+ warmup_updates: int = EXPECTED_TRAIN
31
+ save_every: int = 500
32
+ backup_every: int = 20000
33
+ use_fsdp: bool = True
34
+
35
+ # EMA UPDATE
36
+ ema_start_iters: int = None
37
+ ema_iters: int = None
38
+ ema_beta: float = None
39
+
40
+ # GDF setting
41
+ gdf_target_reparametrization: TargetReparametrization = None # epsilon or x0
42
+
43
+ @dataclass() # not frozen, means that fields are mutable. Doesn't support EXPECTED
44
+ class Info(WarpCore.Info):
45
+ ema_loss: float = None
46
+
47
+ @dataclass(frozen=True)
48
+ class Models(WarpCore.Models):
49
+ generator : nn.Module = EXPECTED
50
+ generator_ema : nn.Module = None # optional
51
+
52
+ @dataclass(frozen=True)
53
+ class Optimizers(WarpCore.Optimizers):
54
+ generator : any = EXPECTED
55
+
56
+ @dataclass(frozen=True)
57
+ class Schedulers(WarpCore.Schedulers):
58
+ generator: any = None
59
+
60
+ @dataclass(frozen=True)
61
+ class Extras(WarpCore.Extras):
62
+ gdf: GDF = EXPECTED
63
+ sampling_configs: dict = EXPECTED
64
+
65
+ # --------------------------------------------
66
+ info: Info
67
+ config: Config
68
+
69
+ @abstractmethod
70
+ def encode_latents(self, batch: dict, models: Models, extras: Extras) -> torch.Tensor:
71
+ raise NotImplementedError("This method needs to be overriden")
72
+
73
+ @abstractmethod
74
+ def decode_latents(self, latents: torch.Tensor, batch: dict, models: Models, extras: Extras) -> torch.Tensor:
75
+ raise NotImplementedError("This method needs to be overriden")
76
+
77
+ @abstractmethod
78
+ def get_conditions(self, batch: dict, models: Models, extras: Extras, is_eval=False, is_unconditional=False):
79
+ raise NotImplementedError("This method needs to be overriden")
80
+
81
+ @abstractmethod
82
+ def webdataset_path(self, extras: Extras):
83
+ raise NotImplementedError("This method needs to be overriden")
84
+
85
+ @abstractmethod
86
+ def webdataset_filters(self, extras: Extras):
87
+ raise NotImplementedError("This method needs to be overriden")
88
+
89
+ @abstractmethod
90
+ def webdataset_preprocessors(self, extras: Extras):
91
+ raise NotImplementedError("This method needs to be overriden")
92
+
93
+ @abstractmethod
94
+ def sample(self, models: Models, data: WarpCore.Data, extras: Extras):
95
+ raise NotImplementedError("This method needs to be overriden")
96
+ # -------------
97
+
98
    def setup_data(self, extras: Extras) -> WarpCore.Data:
        """Build the webdataset pipeline + dataloader from the subclass hooks.

        Each preprocessor is expected to be a (source_key, transform,
        output_key) triple. The per-process batch size is the global batch
        size divided by world_size * grad_accum_steps.
        """
        # SETUP DATASET
        dataset_path = self.webdataset_path(extras)
        preprocessors = self.webdataset_preprocessors(extras)
        filters = self.webdataset_filters(extras)

        # warn_and_continue: skip corrupt samples instead of aborting the epoch
        handler = warn_and_continue # None
        # handler = None
        dataset = wds.WebDataset(
            dataset_path, resampled=True, handler=handler
        ).select(filters).shuffle(690, handler=handler).decode(
            "pilrgb", handler=handler
        ).to_tuple(
            *[p[0] for p in preprocessors], handler=handler
        ).map_tuple(
            *[p[1] for p in preprocessors], handler=handler
        ).map(lambda x: {p[2]:x[i] for i, p in enumerate(preprocessors)})

        # SETUP DATALOADER
        real_batch_size = self.config.batch_size//(self.world_size*self.config.grad_accum_steps)
        dataloader = DataLoader(
            dataset, batch_size=real_batch_size, num_workers=8, pin_memory=True
        )

        return self.Data(dataset=dataset, dataloader=dataloader, iterator=iter(dataloader))
123
+
124
    def forward_pass(self, data: WarpCore.Data, extras: Extras, models: Models):
        """One forward pass: fetch a batch, diffuse latents, predict, weight the MSE.

        Returns:
            (loss, loss_adjusted): per-sample losses, and the scalar loss
            scaled by the GDF loss weight and divided by grad_accum_steps.
        """
        batch = next(data.iterator)

        # Conditioning and latent encoding are not trained here
        with torch.no_grad():
            conditions = self.get_conditions(batch, models, extras)
            latents = self.encode_latents(batch, models, extras)
            noised, noise, target, logSNR, noise_cond, loss_weight = extras.gdf.diffuse(latents, shift=1, loss_shift=1)

        # FORWARD PASS
        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            pred = models.generator(noised, noise_cond, **conditions)
            if self.config.gdf_target_reparametrization == TargetReparametrization.EPSILON:
                pred = extras.gdf.undiffuse(noised, logSNR, pred)[1] # transform whatever prediction to epsilon to use in the loss
                target = noise
            elif self.config.gdf_target_reparametrization == TargetReparametrization.X0:
                pred = extras.gdf.undiffuse(noised, logSNR, pred)[0] # transform whatever prediction to x0 to use in the loss
                target = latents
            # NOTE(review): mean over dims [1, 2, 3] assumes 4D (B, C, H, W)
            # latents — confirm against encode_latents implementations.
            loss = nn.functional.mse_loss(pred, target, reduction='none').mean(dim=[1, 2, 3])
            loss_adjusted = (loss * loss_weight).mean() / self.config.grad_accum_steps

        return loss, loss_adjusted
145
+
146
+ def train(self, data: WarpCore.Data, extras: Extras, models: Models, optimizers: Optimizers, schedulers: Schedulers):
147
+ start_iter = self.info.iter+1
148
+ max_iters = self.config.updates * self.config.grad_accum_steps
149
+ if self.is_main_node:
150
+ print(f"STARTING AT STEP: {start_iter}/{max_iters}")
151
+
152
+ pbar = tqdm(range(start_iter, max_iters+1)) if self.is_main_node else range(start_iter, max_iters+1) # <--- DDP
153
+ models.generator.train()
154
+ for i in pbar:
155
+ # FORWARD PASS
156
+ loss, loss_adjusted = self.forward_pass(data, extras, models)
157
+
158
+ # BACKWARD PASS
159
+ if i % self.config.grad_accum_steps == 0 or i == max_iters:
160
+ loss_adjusted.backward()
161
+ grad_norm = nn.utils.clip_grad_norm_(models.generator.parameters(), 1.0)
162
+ optimizers_dict = optimizers.to_dict()
163
+ for k in optimizers_dict:
164
+ optimizers_dict[k].step()
165
+ schedulers_dict = schedulers.to_dict()
166
+ for k in schedulers_dict:
167
+ schedulers_dict[k].step()
168
+ models.generator.zero_grad(set_to_none=True)
169
+ self.info.total_steps += 1
170
+ else:
171
+ with models.generator.no_sync():
172
+ loss_adjusted.backward()
173
+ self.info.iter = i
174
+
175
+ # UPDATE EMA
176
+ if models.generator_ema is not None and i % self.config.ema_iters == 0:
177
+ update_weights_ema(
178
+ models.generator_ema, models.generator,
179
+ beta=(self.config.ema_beta if i > self.config.ema_start_iters else 0)
180
+ )
181
+
182
+ # UPDATE LOSS METRICS
183
+ self.info.ema_loss = loss.mean().item() if self.info.ema_loss is None else self.info.ema_loss * 0.99 + loss.mean().item() * 0.01
184
+
185
+ if self.is_main_node and self.config.wandb_project is not None and np.isnan(loss.mean().item()) or np.isnan(grad_norm.item()):
186
+ wandb.alert(
187
+ title=f"NaN value encountered in training run {self.info.wandb_run_id}",
188
+ text=f"Loss {loss.mean().item()} - Grad Norm {grad_norm.item()}. Run {self.info.wandb_run_id}",
189
+ wait_duration=60*30
190
+ )
191
+
192
+ if self.is_main_node:
193
+ logs = {
194
+ 'loss': self.info.ema_loss,
195
+ 'raw_loss': loss.mean().item(),
196
+ 'grad_norm': grad_norm.item(),
197
+ 'lr': optimizers.generator.param_groups[0]['lr'],
198
+ 'total_steps': self.info.total_steps,
199
+ }
200
+
201
+ pbar.set_postfix(logs)
202
+ if self.config.wandb_project is not None:
203
+ wandb.log(logs)
204
+
205
+ if i == 1 or i % (self.config.save_every*self.config.grad_accum_steps) == 0 or i == max_iters:
206
+ # SAVE AND CHECKPOINT STUFF
207
+ if np.isnan(loss.mean().item()):
208
+ if self.is_main_node and self.config.wandb_project is not None:
209
+ tqdm.write("Skipping sampling & checkpoint because the loss is NaN")
210
+ wandb.alert(title=f"Skipping sampling & checkpoint for training run {self.config.run_id}", text=f"Skipping sampling & checkpoint at {self.info.total_steps} for training run {self.info.wandb_run_id} iters because loss is NaN")
211
+ else:
212
+ self.save_checkpoints(models, optimizers)
213
+ if self.is_main_node:
214
+ create_folder_if_necessary(f'{self.config.output_path}/{self.config.experiment_id}/')
215
+ self.sample(models, data, extras)
216
+
217
+ def models_to_save(self):
218
+ return ['generator', 'generator_ema']
219
+
220
+ def save_checkpoints(self, models: Models, optimizers: Optimizers, suffix=None):
221
+ barrier()
222
+ suffix = '' if suffix is None else suffix
223
+ self.save_info(self.info, suffix=suffix)
224
+ models_dict = models.to_dict()
225
+ optimizers_dict = optimizers.to_dict()
226
+ for key in self.models_to_save():
227
+ model = models_dict[key]
228
+ if model is not None:
229
+ self.save_model(model, f"{key}{suffix}", is_fsdp=self.config.use_fsdp)
230
+ for key in optimizers_dict:
231
+ optimizer = optimizers_dict[key]
232
+ if optimizer is not None:
233
+ self.save_optimizer(optimizer, f'{key}_optim{suffix}', fsdp_model=models.generator if self.config.use_fsdp else None)
234
+ if suffix == '' and self.info.total_steps > 1 and self.info.total_steps % self.config.backup_every == 0:
235
+ self.save_checkpoints(models, optimizers, suffix=f"_{self.info.total_steps//1000}k")
236
+ torch.cuda.empty_cache()
core/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from .base_dto import Base, nested_dto, EXPECTED, EXPECTED_TRAIN
2
+ from .save_and_load import create_folder_if_necessary, safe_save, load_or_fail
3
+
4
+ # TODO: move this function somewhere more appropriate
5
def update_weights_ema(tgt_model, src_model, beta=0.999):
    """In-place EMA update: tgt <- beta * tgt + (1 - beta) * src.

    Blends both parameters and buffers; each source tensor is copied to the
    corresponding target tensor's device before mixing.
    """
    blend = 1 - beta
    for tgt, src in zip(tgt_model.parameters(), src_model.parameters()):
        tgt.data = tgt.data * beta + src.data.clone().to(tgt.device) * blend
    for tgt, src in zip(tgt_model.buffers(), src_model.buffers()):
        tgt.data = tgt.data * beta + src.data.clone().to(tgt.device) * blend
core/utils/base_dto.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ from dataclasses import dataclass, _MISSING_TYPE
3
+ from munch import Munch
4
+
5
+ EXPECTED = "___REQUIRED___"
6
+ EXPECTED_TRAIN = "___REQUIRED_TRAIN___"
7
+
8
+ # pylint: disable=invalid-field-call
9
def nested_dto(x, raw=False):
    """Wrap a default value for a dataclass field holding a nested config.

    Returns a dataclasses field whose default_factory yields `x` unchanged
    when `raw` is True, otherwise converted to a Munch (attribute-style dict).
    """
    def _factory():
        return x if raw else Munch.fromDict(x)
    return dataclasses.field(default_factory=_factory)
11
+
12
+ @dataclass(frozen=True)
13
+ class Base:
14
+ training: bool = None
15
+ def __new__(cls, **kwargs):
16
+ training = kwargs.get('training', True)
17
+ setteable_fields = cls.setteable_fields(**kwargs)
18
+ mandatory_fields = cls.mandatory_fields(**kwargs)
19
+ invalid_kwargs = [
20
+ {k: v} for k, v in kwargs.items() if k not in setteable_fields or v == EXPECTED or (v == EXPECTED_TRAIN and training is not False)
21
+ ]
22
+ print(mandatory_fields)
23
+ assert (
24
+ len(invalid_kwargs) == 0
25
+ ), f"Invalid fields detected when initializing this DTO: {invalid_kwargs}.\nDeclare this field and set it to None or EXPECTED in order to make it setteable."
26
+ missing_kwargs = [f for f in mandatory_fields if f not in kwargs]
27
+ assert (
28
+ len(missing_kwargs) == 0
29
+ ), f"Required fields missing initializing this DTO: {missing_kwargs}."
30
+ return object.__new__(cls)
31
+
32
+
33
+ @classmethod
34
+ def setteable_fields(cls, **kwargs):
35
+ return [f.name for f in dataclasses.fields(cls) if f.default is None or isinstance(f.default, _MISSING_TYPE) or f.default == EXPECTED or f.default == EXPECTED_TRAIN]
36
+
37
+ @classmethod
38
+ def mandatory_fields(cls, **kwargs):
39
+ training = kwargs.get('training', True)
40
+ return [f.name for f in dataclasses.fields(cls) if isinstance(f.default, _MISSING_TYPE) and isinstance(f.default_factory, _MISSING_TYPE) or f.default == EXPECTED or (f.default == EXPECTED_TRAIN and training is not False)]
41
+
42
+ @classmethod
43
+ def from_dict(cls, kwargs):
44
+ for k in kwargs:
45
+ if isinstance(kwargs[k], (dict, list, tuple)):
46
+ kwargs[k] = Munch.fromDict(kwargs[k])
47
+ return cls(**kwargs)
48
+
49
+ def to_dict(self):
50
+ # selfdict = dataclasses.asdict(self) # needs to pickle stuff, doesn't support some more complex classes
51
+ selfdict = {}
52
+ for k in dataclasses.fields(self):
53
+ selfdict[k.name] = getattr(self, k.name)
54
+ if isinstance(selfdict[k.name], Munch):
55
+ selfdict[k.name] = selfdict[k.name].toDict()
56
+ return selfdict
core/utils/save_and_load.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import json
4
+ from pathlib import Path
5
+ import safetensors
6
+ import wandb
7
+
8
+
9
def create_folder_if_necessary(path):
    """Ensure the directory containing `path` exists.

    The last path component is treated as a file name and dropped (so a
    trailing "/" makes the full path itself be created, matching how callers
    pass directory paths). Creation is recursive and idempotent.
    Uses os.path.dirname instead of manual "/"-splitting so OS-native
    separators are handled correctly as well.
    """
    Path(os.path.dirname(path)).mkdir(parents=True, exist_ok=True)
12
+
13
+
14
def safe_save(ckpt, path):
    """Write a checkpoint, keeping the previous file at `<path>.bak`.

    The old backup is deleted and the current file (if present) is rotated to
    `.bak` before writing, so a crash mid-write still leaves a usable copy.
    The serializer is chosen from the file extension (.pt/.ckpt, .json or
    .safetensors); anything else raises ValueError.
    """
    backup = f"{path}.bak"
    try:
        os.remove(backup)
    except OSError:
        pass  # no previous backup
    try:
        os.rename(path, backup)
    except OSError:
        pass  # no previous checkpoint to rotate
    if path.endswith((".pt", ".ckpt")):
        torch.save(ckpt, path)
    elif path.endswith(".json"):
        with open(path, "w", encoding="utf-8") as handle:
            json.dump(ckpt, handle, indent=4)
    elif path.endswith(".safetensors"):
        # NOTE(review): relies on `safetensors.torch` being importable as an
        # attribute of `safetensors` — confirm the submodule is loaded.
        safetensors.torch.save_file(ckpt, path)
    else:
        raise ValueError(f"File extension not supported: {path}")
32
+
33
+
34
def load_or_fail(path, wandb_run_id=None):
    """Load a checkpoint (.pt/.ckpt, .json or .safetensors) from `path`.

    Returns None when the file does not exist. Any failure is re-raised,
    optionally after sending a wandb alert when `wandb_run_id` is given.
    """
    accepted_extensions = [".pt", ".ckpt", ".json", ".safetensors"]
    try:
        assert any(
            path.endswith(ext) for ext in accepted_extensions
        ), f"Automatic loading not supported for this extension: {path}"
        if not os.path.exists(path):
            checkpoint = None
        elif path.endswith((".pt", ".ckpt")):
            checkpoint = torch.load(path, map_location="cpu")
        elif path.endswith(".json"):
            with open(path, "r", encoding="utf-8") as handle:
                checkpoint = json.load(handle)
        else:
            # .safetensors — guaranteed by the extension assert above
            checkpoint = {}
            with safetensors.safe_open(path, framework="pt", device="cpu") as handle:
                for key in handle.keys():
                    checkpoint[key] = handle.get_tensor(key)
        return checkpoint
    except Exception as e:
        if wandb_run_id is not None:
            wandb.alert(
                title=f"Corrupt checkpoint for run {wandb_run_id}",
                text=f"Training {wandb_run_id} tried to load checkpoint {path} and failed",
            )
        raise e
figures/collage_1.jpg ADDED

Git LFS Details

  • SHA256: ec5fbc465bd5fa24755689283aca45478ce546a20af8ebcc068962b72a341e0b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.51 MB
figures/collage_2.jpg ADDED
figures/collage_3.jpg ADDED

Git LFS Details

  • SHA256: 6ad3b1481eb89e4f73dbfdb83589509048e4356d14f900b5351195057736bb32
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
figures/collage_4.jpg ADDED
figures/comparison-inference-speed.jpg ADDED
figures/comparison.png ADDED
figures/controlnet-canny.jpg ADDED
figures/controlnet-face.jpg ADDED
figures/controlnet-paint.jpg ADDED
figures/controlnet-sr.jpg ADDED

Git LFS Details

  • SHA256: f3e8060eebe3a26d7ee49cf553a5892180889868a85257511588de7e94937ee1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
figures/fernando.jpg ADDED
figures/fernando_original.jpg ADDED
figures/image-to-image-example-rodent.jpg ADDED
figures/image-variations-example-headset.jpg ADDED
figures/model-overview.jpg ADDED