isatis committed on
Commit
f5210ab
1 Parent(s): 15ff747

add custom handler

embeddings/EasyNegative.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c74b4e810b030f6b75fde959e2db678c268d07115b85356d3c0138ba5eb42340
+ size 24655
embeddings/NegfeetV2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df90b1ff666d80a7e3b07831761d8a41ea40b41d38a796cb68a71d1e12772ca6
+ size 25442
embeddings/bad-artist-anime.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f7bea88750c97a0b8c9ba9f5bc0d13648c3a17a69aaac855903229d5f58c34b
+ size 7083
embeddings/bad-hands-5.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa7651be154c46a2f4868788ef84a92b3083b0c0c5c46f5012a56698bfd2a1ba
+ size 7083
embeddings/bad_prompt_version2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f35e7dd816ae04bb3f774a9a17ebfbc50c0e3a53f69a9a40bed05936d3a3812
+ size 25515
embeddings/badhandv4.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e40d722fc3d0c2decb62debfaf8058db30ccdae9ab00ff64b183907b435708e
+ size 19371
embeddings/nartfixer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0504f05844290ac4d2de41d0338fb642548fb18efd8c6de7bb571ab1d60af89
+ size 82783
embeddings/ng_deepnegative_v1_75t.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:54e7e4826d53949a3d0dde40aea023b1e456a618c608a7630e3999fd38f93245
+ size 231339
embeddings/polyhedron_skinny_all.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:210b1ee059ef769cff1df73b119ffe3209ace2ceb01dd4aaa8649fc509108534
+ size 302262993
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
handler.py ADDED
@@ -0,0 +1,283 @@
+ import base64
+ import json
+ import sys
+ from collections import defaultdict
+ from io import BytesIO
+ from pprint import pprint
+ from typing import Any, Dict, List
+
+ import torch
+ from diffusers import (
+     DiffusionPipeline,
+     DPMSolverMultistepScheduler,
+     DPMSolverSinglestepScheduler,
+     EulerAncestralDiscreteScheduler,
+ )
+ from safetensors.torch import load_file
+ from torch import autocast
+
+ # https://huggingface.co/philschmid/stable-diffusion-v1-4-endpoints
+ # https://huggingface.co/docs/inference-endpoints/guides/custom_handler
+
+
+ # set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ if device.type != "cuda":
+     raise ValueError("need to run on GPU")
+
+
+ class EndpointHandler:
+     LORA_PATHS = {
+         "hairdetailer": r"lora/hairdetailer.safetensors",
+         "lora_leica": r"lora/lora_leica.safetensors",
+         "epiNoiseoffset_v2": r"lora/epiNoiseoffset_v2.safetensors",
+         "MBHU-TT2FRS": r"lora/MBHU-TT2FRS.safetensors",
+         "ShinyOiledSkin_v20": r"lora/ShinyOiledSkin_v20-LoRA.safetensors",
+         "polyhedron_new_skin_v1.1": r"lora/polyhedron_new_skin_v1.1.safetensors",
+         "detailed_eye-10": r"lora/detailed_eye-10.safetensors",
+         "add_detail": r"lora/add_detail.safetensors",
+         "MuscleGirl_v1": r"lora/MuscleGirl_v1.safetensors",
+     }
+
+     TEXTUAL_INVERSION = [
+         {
+             "weight_name": "embeddings/EasyNegative.safetensors",
+             "token": "easynegative",
+         },
+         {
+             "weight_name": "embeddings/EasyNegative.safetensors",
+             "token": "EasyNegative",
+         },
+         {"weight_name": "embeddings/badhandv4.pt", "token": "badhandv4"},
+         {
+             "weight_name": "embeddings/bad-artist-anime.pt",
+             "token": "bad-artist-anime",
+         },
+         {"weight_name": "embeddings/NegfeetV2.pt", "token": "NegfeetV2"},
+         {
+             "weight_name": "embeddings/ng_deepnegative_v1_75t.pt",
+             "token": "ng_deepnegative_v1_75t",
+         },
+         {
+             "weight_name": "embeddings/ng_deepnegative_v1_75t.pt",
+             "token": "NG_DeepNegative_V1_75T",
+         },
+         {"weight_name": "embeddings/bad-hands-5.pt", "token": "bad-hands-5"},
+     ]
+
+     def __init__(self, path="."):
+         # load the optimized model
+         self.pipe = DiffusionPipeline.from_pretrained(
+             path,
+             custom_pipeline="lpw_stable_diffusion",  # avoid the 77-token prompt limit
+             torch_dtype=torch.float16,  # speed up rendering
+         )
+         self.pipe = self.pipe.to(device)
+
+         # DPM++ 2M SDE Karras
+         # increase steps to avoid high contrast, e.g. num_inference_steps=30
+         self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+             self.pipe.scheduler.config,
+             use_karras_sigmas=True,
+             algorithm_type="sde-dpmsolver++",
+         )
+
+         # Mode boulardus: disable the safety checker
+         self.pipe.safety_checker = None
+
+         # Load negative embeddings to avoid bad hands, etc.
+         self.load_embeddings()
+
+         # Load default LoRA models
+         self.pipe = self.load_selected_loras(
+             [
+                 ("polyhedron_new_skin_v1.1", 0.35),  # nice skin
+                 ("detailed_eye-10", 0.3),  # nice eyes
+                 ("add_detail", 0.4),  # detailed pictures
+                 ("MuscleGirl_v1", 0.3),  # body shaping
+             ],
+         )
+
+         # boosts performance by another 20%
+         self.pipe.enable_xformers_memory_efficient_attention()
+         self.pipe.enable_attention_slicing()
+
+     def load_lora(self, pipeline, lora_path, lora_weight=0.5):
+         state_dict = load_file(lora_path)
+         LORA_PREFIX_UNET = "lora_unet"
+         LORA_PREFIX_TEXT_ENCODER = "lora_te"
+
+         alpha = lora_weight
+         visited = []
+
+         for key in state_dict:
+             state_dict[key] = state_dict[key].to(device)
+
+         # directly update weight in diffusers model
+         for key in state_dict:
+             # alpha was set beforehand, so just skip
+             if ".alpha" in key or key in visited:
+                 continue
+
+             if "text" in key:
+                 layer_infos = (
+                     key.split(".")[0]
+                     .split(LORA_PREFIX_TEXT_ENCODER + "_")[-1]
+                     .split("_")
+                 )
+                 curr_layer = pipeline.text_encoder
+             else:
+                 layer_infos = (
+                     key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
+                 )
+                 curr_layer = pipeline.unet
+
+             # find the target layer
+             temp_name = layer_infos.pop(0)
+             while len(layer_infos) > -1:
+                 try:
+                     curr_layer = curr_layer.__getattr__(temp_name)
+                     if len(layer_infos) > 0:
+                         temp_name = layer_infos.pop(0)
+                     elif len(layer_infos) == 0:
+                         break
+                 except Exception:
+                     if len(temp_name) > 0:
+                         temp_name += "_" + layer_infos.pop(0)
+                     else:
+                         temp_name = layer_infos.pop(0)
+
+             # org_forward(x) + lora_up(lora_down(x)) * multiplier
+             pair_keys = []
+             if "lora_down" in key:
+                 pair_keys.append(key.replace("lora_down", "lora_up"))
+                 pair_keys.append(key)
+             else:
+                 pair_keys.append(key)
+                 pair_keys.append(key.replace("lora_up", "lora_down"))
+
+             # update weight
+             if len(state_dict[pair_keys[0]].shape) == 4:
+                 weight_up = (
+                     state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
+                 )
+                 weight_down = (
+                     state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
+                 )
+                 curr_layer.weight.data += alpha * torch.mm(
+                     weight_up, weight_down
+                 ).unsqueeze(2).unsqueeze(3)
+             else:
+                 weight_up = state_dict[pair_keys[0]].to(torch.float32)
+                 weight_down = state_dict[pair_keys[1]].to(torch.float32)
+                 curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down)
+
+             # update visited list
+             for item in pair_keys:
+                 visited.append(item)
+
+         return pipeline
+
+     def load_embeddings(self):
+         """Load textual inversions to help avoid bad prompts."""
+         for model in EndpointHandler.TEXTUAL_INVERSION:
+             self.pipe.load_textual_inversion(
+                 ".", weight_name=model["weight_name"], token=model["token"]
+             )
+
+     def load_selected_loras(self, selections):
+         """Load the selected LoRA models; can lead to marvelous creations."""
+         for model_name, weight in selections:
+             lora_path = EndpointHandler.LORA_PATHS[model_name]
+             self.pipe = self.load_lora(
+                 pipeline=self.pipe, lora_path=lora_path, lora_weight=weight
+             )
+         return self.pipe
+
+     def __call__(self, data: Any) -> Dict[str, Any]:
+         """
+         Args:
+             data (:obj:):
+                 includes the input data and the parameters for the inference.
+         Return:
+             A :obj:`dict` with the base64-encoded image or an error message.
+         """
+         global device
+
+         # Which LoRA models do we load?
+         # selected_models = [
+         #     ("ShinyOiledSkin_v20", 0.3),
+         #     ("MBHU-TT2FRS", 0.5),
+         #     ("hairdetailer", 0.5),
+         #     ("lora_leica", 0.5),
+         #     ("epiNoiseoffset_v2", 0.5),
+         # ]
+
+         # 1. Verify input arguments
+         required_fields = [
+             "prompt",
+             "negative_prompt",
+             "width",
+             "num_inference_steps",
+             "height",
+             "seed",
+             "guidance_scale",
+         ]
+
+         missing_fields = [field for field in required_fields if field not in data]
+
+         if missing_fields:
+             return {
+                 "flag": "error",
+                 "message": f"Missing fields: {', '.join(missing_fields)}",
+             }
+
+         # Now extract the fields
+         prompt = data["prompt"]
+         negative_prompt = data["negative_prompt"]
+         loras_model = data.pop("loras_model", None)
+         seed = data["seed"]
+         width = data["width"]
+         num_inference_steps = data["num_inference_steps"]
+         height = data["height"]
+         guidance_scale = data["guidance_scale"]
+
+         # Use this to automatically append some negative prompts
+         forced_negative = (
+             negative_prompt
+             + """easynegative, badhandv4, bad-artist-anime, NegfeetV2, ng_deepnegative_v1_75t, bad-hands-5 """
+         )
+
+         # Set the generator seed if provided
+         generator = torch.Generator(device="cuda").manual_seed(seed) if seed else None
+
+         # Load the provided LoRA models
+         if loras_model:
+             self.pipe = self.load_selected_loras(loras_model)
+
+         try:
+             # 2. Process
+             with autocast(device.type):
+                 image = self.pipe.text2img(
+                     prompt=prompt,
+                     guidance_scale=guidance_scale,
+                     num_inference_steps=num_inference_steps,
+                     height=height,
+                     width=width,
+                     negative_prompt=forced_negative,
+                     generator=generator,
+                     max_embeddings_multiples=5,
+                 ).images[0]
+
+             # encode image as base64
+             buffered = BytesIO()
+             image.save(buffered, format="JPEG")
+             img_str = base64.b64encode(buffered.getvalue())
+
+             # Return the success response
+             return {"flag": "success", "image": img_str.decode()}
+
+         except Exception as e:
+             # Handle any other exceptions and return an error response
+             return {"flag": "error", "message": str(e)}
lora/FilmVelvia3.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac8b0e4aa77be4d8b83da9bafe0134a2e36504c9b5263a7030394cffe4f7003a
+ size 151108832
lora/InstantPhotoX3.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4969da4a70d0eaec5eb76aa6746d1a9b177c9fe58558878d9725b460c4a44b9
+ size 151108832
lora/MBHU-TT2FRS.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60845bdd338d468ab086d19180c24508b873f362445451b6b3c7ff91fab885bb
+ size 18582436
lora/MuscleGirl_v1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2e9e3a591ca9fd274e8a0752bb871323a23eb1dbb8cc15e358de448327c29b6
+ size 37863942
lora/ShinyOiledSkin_v20-LoRA.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:987a38bcee368844b21cdb9cb51101c863fa40c2643f19cb15ef3c762d6127da
+ size 75612660
lora/add_detail.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47aaaf0d2945ca937151d61304946dd229b3f072140b85484bc93e38f2a6e2f7
+ size 37861176
lora/detailed_eye-10.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c91b1a73ab1a8bb07bb540112818a5458f7364501e35d8748b63c483a18dd7f
+ size 18996195
lora/epiNoiseoffset_v2.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81680c064e9f50dfcc11ec5e25da1832f523ec84afd544f372c7786f3ddcbbac
+ size 81479800
lora/hairdetailer.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0efcd0fe6630156f8c7127f2ffe9d951bec1b2b2ee38b4de97cc865d86f7203
+ size 9548007
lora/lora_leica.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c360331ab917344da5180d95abedfabd185870d69b74ddd544c8ecc07b3c653d
+ size 151108831
lora/polyhedron_new_skin_v1.1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8cbec5f3c0baf4f597b46043df01d7ec3210a751a9943bbf45e1602c447fd440
+ size 151128227
model_index.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_class_name": "StableDiffusionPipeline",
+   "_diffusers_version": "0.20.0",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "requires_safety_checker": true,
+   "safety_checker": [
+     "stable_diffusion",
+     "StableDiffusionSafetyChecker"
+   ],
+   "scheduler": [
+     "diffusers",
+     "PNDMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "diffusers",
+     "UNet2DConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
safety_checker/config.json ADDED
@@ -0,0 +1,181 @@
+ {
+   "_commit_hash": "cb41f3a270d63d454d385fc2e4f571c487c253c5",
+   "_name_or_path": "CompVis/stable-diffusion-safety-checker",
+   "architectures": [
+     "StableDiffusionSafetyChecker"
+   ],
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "clip",
+   "projection_dim": 768,
+   "text_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 77,
+     "min_length": 0,
+     "model_type": "clip_text_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.25.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vocab_size": 49408
+   },
+   "text_config_dict": {
+     "hidden_size": 768,
+     "intermediate_size": 3072,
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12
+   },
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.25.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "vision_config_dict": {
+     "hidden_size": 1024,
+     "intermediate_size": 4096,
+     "num_attention_heads": 16,
+     "num_hidden_layers": 24,
+     "patch_size": 14
+   }
+ }
safety_checker/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16d28f2b37109f222cdc33620fdd262102ac32112be0352a7f77e9614b35a394
+ size 1216064769
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "_class_name": "PNDMScheduler",
+   "_diffusers_version": "0.20.0",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "num_train_timesteps": 1000,
+   "prediction_type": "epsilon",
+   "set_alpha_to_one": false,
+   "skip_prk_steps": true,
+   "steps_offset": 1,
+   "timestep_spacing": "leading",
+   "trained_betas": null
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dropout": 0.0,
+   "eos_token_id": 2,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 768,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 77,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "projection_dim": 768,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "vocab_size": 49408
+ }
text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57f6e3badaffb5713c93e1f34ac3abf2ee3cd48e60d01714a0a6ed33f3406a5a
+ size 492307041
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<|endoftext|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "do_lower_case": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 77,
+   "name_or_path": "openai/clip-vit-large-patch14",
+   "pad_token": "<|endoftext|>",
+   "special_tokens_map_file": "./special_tokens_map.json",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.20.0",
+   "act_fn": "silu",
+   "addition_embed_type": null,
+   "addition_embed_type_num_heads": 64,
+   "addition_time_embed_dim": null,
+   "attention_head_dim": 8,
+   "attention_type": "default",
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "class_embed_type": null,
+   "class_embeddings_concat": false,
+   "conv_in_kernel": 3,
+   "conv_out_kernel": 3,
+   "cross_attention_dim": 768,
+   "cross_attention_norm": null,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "dual_cross_attention": false,
+   "encoder_hid_dim": null,
+   "encoder_hid_dim_type": null,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_only_cross_attention": null,
+   "mid_block_scale_factor": 1,
+   "mid_block_type": "UNetMidBlock2DCrossAttn",
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "num_attention_heads": null,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
+   "out_channels": 4,
+   "projection_class_embeddings_input_dim": null,
+   "resnet_out_scale_factor": 1.0,
+   "resnet_skip_time_act": false,
+   "resnet_time_scale_shift": "default",
+   "sample_size": 64,
+   "time_cond_proj_dim": null,
+   "time_embedding_act_fn": null,
+   "time_embedding_dim": null,
+   "time_embedding_type": "positional",
+   "timestep_post_act": null,
+   "transformer_layers_per_block": 1,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ],
+   "upcast_attention": false,
+   "use_linear_projection": false
+ }
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b64c917a491e553494fa0eb452824cc068296273a09436ca0803b06c42046c7d
+ size 3438366373
v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false # Note: different from the one we trained before
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+
+     scheduler_config: # 10000 warmup steps
+       target: ldm.lr_scheduler.LambdaLinearScheduler
+       params:
+         warm_up_steps: [ 10000 ]
+         cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+         f_start: [ 1.e-6 ]
+         f_max: [ 1. ]
+         f_min: [ 1. ]
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
vae/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.20.0",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "force_upcast": true,
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 512,
+   "scaling_factor": 0.18215,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ]
+ }
vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a425a89f2e522790b3975b93ed380814e68ec77a04841dced0832cad70eab929
+ size 334712113