eykarim committed on
Commit
ea8fb12
·
1 Parent(s): f8c8561

Upload 21 files

README.md ADDED
@@ -0,0 +1,70 @@
+ ---
+ license: creativeml-openrail-m
+ tags:
+ - stable-diffusion
+ - stable-diffusion-diffusers
+ - text-to-image
+ - endpoints-template
+ inference: false
+ ---
+
+ # Fork of [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4)
+
+ > Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input.
+ > For more information about how Stable Diffusion functions, please have a look at [🤗's Stable Diffusion with 🧨Diffusers blog](https://huggingface.co/blog/stable_diffusion).
+
+ For more information about the model, license, and limitations, check the original model card at [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4).
+
+ ### License (CreativeML OpenRAIL-M)
+
+ The full license can be found here: https://huggingface.co/spaces/CompVis/stable-diffusion-license
+
+ ---
+
+ This repository implements a custom `handler` task for `text-to-image` for 🤗 Inference Endpoints. The code for the customized handler is in [handler.py](https://huggingface.co/philschmid/stable-diffusion-v1-4-endpoints/blob/main/handler.py).
+
+ There is also a [notebook](https://huggingface.co/philschmid/stable-diffusion-v1-4-endpoints/blob/main/create_handler.ipynb) included that shows how to create the `handler.py`.
+
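+ At runtime, 🤗 Inference Endpoints instantiates the `EndpointHandler` class from `handler.py` and calls it with the request payload. The same contract can be exercised locally, roughly as the notebook does (a sketch; it requires a GPU, since the handler raises an error on CPU):
+
+ ```python
+ from handler import EndpointHandler
+
+ # load the pipeline from the repository root
+ my_handler = EndpointHandler(path=".")
+
+ # call the handler with the same payload an endpoint request would carry
+ pred = my_handler({"inputs": "A prompt used for image generation"})
+ # pred == {"image": "<base64-encoded JPEG>"}
+ ```
+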
+ ### Expected Request Payload
+ ```json
+ {
+   "inputs": "A prompt used for image generation"
+ }
+ ```
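+
+ The handler returns the generated image as a base64-encoded JPEG string, so the response payload has the following shape (the base64 value below is a truncated placeholder):
+
+ ```json
+ {
+   "image": "/9j/4AAQSkZJRg..."
+ }
+ ```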
+
+ Below is an example of how to run a request using Python and `requests`.
+
+ ## Run Request
+ ```python
+ import base64
+ from io import BytesIO
+
+ import requests as r
+ from PIL import Image
+
+ ENDPOINT_URL = ""
+ HF_TOKEN = ""
+
+ # helper decoder
+ def decode_base64_image(image_string):
+     base64_image = base64.b64decode(image_string)
+     buffer = BytesIO(base64_image)
+     return Image.open(buffer)
+
+
+ def predict(prompt: str):
+     payload = {"inputs": prompt}
+     response = r.post(
+         ENDPOINT_URL, headers={"Authorization": f"Bearer {HF_TOKEN}"}, json=payload
+     )
+     resp = response.json()
+     return decode_base64_image(resp["image"])
+
+
+ prediction = predict(prompt="the first animal on Mars")
+ ```
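+
+ Since `decode_base64_image` returns a `PIL.Image`, the result can be saved straight to disk (the filename here is arbitrary):
+
+ ```python
+ prediction.save("generated.jpg")
+ ```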
+
+ Expected output:
+
+ ![sample](sample.jpg)
create_handler.ipynb ADDED
@@ -0,0 +1,275 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Setup & Installation"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Overwriting requirements.txt\n"
20
+ ]
21
+ }
22
+ ],
23
+ "source": [
24
+ "%%writefile requirements.txt\n",
25
+ "diffusers==0.2.4"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "!pip install -r requirements.txt --upgrade"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "markdown",
39
+ "metadata": {},
40
+ "source": [
41
+ "## 3. Create Custom Handler for Inference Endpoints\n"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 10,
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "data": {
51
+ "text/plain": [
52
+ "device(type='cuda')"
53
+ ]
54
+ },
55
+ "execution_count": 10,
56
+ "metadata": {},
57
+ "output_type": "execute_result"
58
+ }
59
+ ],
60
+ "source": [
61
+ "import torch\n",
62
+ "\n",
63
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
64
+ "device"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 11,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "if device.type != 'cuda':\n",
74
+ " raise ValueError(\"need to run on GPU\")"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 5,
80
+ "metadata": {},
81
+ "outputs": [
82
+ {
83
+ "name": "stdout",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "Overwriting handler.py\n"
87
+ ]
88
+ }
89
+ ],
90
+ "source": [
91
+ "%%writefile handler.py\n",
92
+ "from typing import Dict, List, Any\n",
93
+ "import torch\n",
94
+ "from torch import autocast\n",
95
+ "from diffusers import StableDiffusionPipeline\n",
96
+ "import base64\n",
97
+ "from io import BytesIO\n",
98
+ "\n",
99
+ "\n",
100
+ "# set device\n",
101
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
102
+ "\n",
103
+ "if device.type != 'cuda':\n",
104
+ " raise ValueError(\"need to run on GPU\")\n",
105
+ "\n",
106
+ "class EndpointHandler():\n",
107
+ " def __init__(self, path=\"\"):\n",
108
+ " # load the optimized model\n",
109
+ " self.pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)\n",
110
+ " self.pipe = self.pipe.to(device)\n",
111
+ "\n",
112
+ "\n",
113
+ " def __call__(self, data: Any) -> List[List[Dict[str, float]]]:\n",
114
+ " \"\"\"\n",
115
+ " Args:\n",
116
+ " data (:obj:):\n",
117
+ " includes the input data and the parameters for the inference.\n",
118
+ " Return:\n",
119
+ " A :obj:`dict`:. base64 encoded image\n",
120
+ " \"\"\"\n",
121
+ " inputs = data.pop(\"inputs\", data)\n",
122
+ " \n",
123
+ " # run inference pipeline\n",
124
+ " with autocast(device.type):\n",
125
+ " image = self.pipe(inputs, guidance_scale=7.5)[\"sample\"][0] \n",
126
+ " \n",
127
+ " # encode image as base 64\n",
128
+ " buffered = BytesIO()\n",
129
+ " image.save(buffered, format=\"JPEG\")\n",
130
+ " img_str = base64.b64encode(buffered.getvalue())\n",
131
+ "\n",
132
+ " # postprocess the prediction\n",
133
+ " return {\"image\": img_str.decode()}"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "markdown",
138
+ "metadata": {},
139
+ "source": [
140
+ "test custom pipeline"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 6,
146
+ "metadata": {},
147
+ "outputs": [
148
+ {
149
+ "data": {
150
+ "text/plain": [
151
+ "'1.11.0+cu113'"
152
+ ]
153
+ },
154
+ "execution_count": 6,
155
+ "metadata": {},
156
+ "output_type": "execute_result"
157
+ }
158
+ ],
159
+ "source": [
160
+ "import torch\n",
161
+ "\n",
162
+ "torch.__version__"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 1,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "name": "stderr",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.\n"
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "from handler import EndpointHandler\n",
180
+ "\n",
181
+ "# init handler\n",
182
+ "my_handler = EndpointHandler(path=\".\")"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 6,
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "application/vnd.jupyter.widget-view+json": {
193
+ "model_id": "376de150f16b4b4bb0c3ab8c513de5c0",
194
+ "version_major": 2,
195
+ "version_minor": 0
196
+ },
197
+ "text/plain": [
198
+ "0it [00:00, ?it/s]"
199
+ ]
200
+ },
201
+ "metadata": {},
202
+ "output_type": "display_data"
203
+ }
204
+ ],
205
+ "source": [
206
+ "import base64\n",
207
+ "from PIL import Image\n",
208
+ "from io import BytesIO\n",
209
+ "import json\n",
210
+ "\n",
211
+ "# helper decoder\n",
212
+ "def decode_base64_image(image_string):\n",
213
+ " base64_image = base64.b64decode(image_string)\n",
214
+ " buffer = BytesIO(base64_image)\n",
215
+ " return Image.open(buffer)\n",
216
+ "\n",
217
+ "# prepare sample payload\n",
218
+ "request = {\"inputs\": \"a high resulotion image of a macbook\"}\n",
219
+ "\n",
220
+ "# test the handler\n",
221
+ "pred = my_handler(request)"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": 4,
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": [
230
+ "decode_base64_image(pred[\"image\"]).save(\"sample.jpg\")"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "markdown",
235
+ "metadata": {},
236
+ "source": [
237
+ "![test](sample.jpg)"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": null,
243
+ "metadata": {},
244
+ "outputs": [],
245
+ "source": []
246
+ }
247
+ ],
248
+ "metadata": {
249
+ "kernelspec": {
250
+ "display_name": "Python 3.9.13 ('dev': conda)",
251
+ "language": "python",
252
+ "name": "python3"
253
+ },
254
+ "language_info": {
255
+ "codemirror_mode": {
256
+ "name": "ipython",
257
+ "version": 3
258
+ },
259
+ "file_extension": ".py",
260
+ "mimetype": "text/x-python",
261
+ "name": "python",
262
+ "nbconvert_exporter": "python",
263
+ "pygments_lexer": "ipython3",
264
+ "version": "3.9.13"
265
+ },
266
+ "orig_nbformat": 4,
267
+ "vscode": {
268
+ "interpreter": {
269
+ "hash": "f6dd96c16031089903d5a31ec148b80aeb0d39c32affb1a1080393235fbfa2fc"
270
+ }
271
+ }
272
+ },
273
+ "nbformat": 4,
274
+ "nbformat_minor": 2
275
+ }
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "crop_size": 224,
3
+ "do_center_crop": true,
4
+ "do_convert_rgb": true,
5
+ "do_normalize": true,
6
+ "do_resize": true,
7
+ "feature_extractor_type": "CLIPFeatureExtractor",
8
+ "image_mean": [
9
+ 0.48145466,
10
+ 0.4578275,
11
+ 0.40821073
12
+ ],
13
+ "image_std": [
14
+ 0.26862954,
15
+ 0.26130258,
16
+ 0.27577711
17
+ ],
18
+ "resample": 3,
19
+ "size": 224
20
+ }
handler.py ADDED
@@ -0,0 +1,42 @@
+ from typing import Dict, Any
+ import torch
+ from torch import autocast
+ from diffusers import StableDiffusionPipeline
+ import base64
+ from io import BytesIO
+
+
+ # set device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ if device.type != 'cuda':
+     raise ValueError("need to run on GPU")
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         # load the model with fp16 weights and move it to the GPU
+         self.pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
+         self.pipe = self.pipe.to(device)
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the input data and the parameters for the inference.
+         Return:
+             A :obj:`dict` with the base64-encoded image
+         """
+         inputs = data.pop("inputs", data)
+
+         # run the inference pipeline
+         with autocast(device.type):
+             image = self.pipe(inputs, guidance_scale=7.5)["sample"][0]
+
+         # encode the image as base64
+         buffered = BytesIO()
+         image.save(buffered, format="JPEG")
+         img_str = base64.b64encode(buffered.getvalue())
+
+         # postprocess the prediction
+         return {"image": img_str.decode()}
model_index.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "_class_name": "StableDiffusionPipeline",
3
+ "_diffusers_version": "0.2.3",
4
+ "feature_extractor": [
5
+ "transformers",
6
+ "CLIPFeatureExtractor"
7
+ ],
8
+ "safety_checker": [
9
+ "stable_diffusion",
10
+ "StableDiffusionSafetyChecker"
11
+ ],
12
+ "scheduler": [
13
+ "diffusers",
14
+ "PNDMScheduler"
15
+ ],
16
+ "text_encoder": [
17
+ "transformers",
18
+ "CLIPTextModel"
19
+ ],
20
+ "tokenizer": [
21
+ "transformers",
22
+ "CLIPTokenizer"
23
+ ],
24
+ "unet": [
25
+ "diffusers",
26
+ "UNet2DConditionModel"
27
+ ],
28
+ "vae": [
29
+ "diffusers",
30
+ "AutoencoderKL"
31
+ ]
32
+ }
requirements.txt ADDED
@@ -0,0 +1 @@
1
+ diffusers==0.2.4
safety_checker/config.json ADDED
@@ -0,0 +1,174 @@
1
+ {
2
+ "_name_or_path": "./safety_checker",
3
+ "architectures": [
4
+ "StableDiffusionSafetyChecker"
5
+ ],
6
+ "initializer_factor": 1.0,
7
+ "logit_scale_init_value": 2.6592,
8
+ "model_type": "clip",
9
+ "projection_dim": 768,
10
+ "text_config": {
11
+ "_name_or_path": "",
12
+ "add_cross_attention": false,
13
+ "architectures": null,
14
+ "attention_dropout": 0.0,
15
+ "bad_words_ids": null,
16
+ "bos_token_id": 0,
17
+ "chunk_size_feed_forward": 0,
18
+ "cross_attention_hidden_size": null,
19
+ "decoder_start_token_id": null,
20
+ "diversity_penalty": 0.0,
21
+ "do_sample": false,
22
+ "dropout": 0.0,
23
+ "early_stopping": false,
24
+ "encoder_no_repeat_ngram_size": 0,
25
+ "eos_token_id": 2,
26
+ "exponential_decay_length_penalty": null,
27
+ "finetuning_task": null,
28
+ "forced_bos_token_id": null,
29
+ "forced_eos_token_id": null,
30
+ "hidden_act": "quick_gelu",
31
+ "hidden_size": 768,
32
+ "id2label": {
33
+ "0": "LABEL_0",
34
+ "1": "LABEL_1"
35
+ },
36
+ "initializer_factor": 1.0,
37
+ "initializer_range": 0.02,
38
+ "intermediate_size": 3072,
39
+ "is_decoder": false,
40
+ "is_encoder_decoder": false,
41
+ "label2id": {
42
+ "LABEL_0": 0,
43
+ "LABEL_1": 1
44
+ },
45
+ "layer_norm_eps": 1e-05,
46
+ "length_penalty": 1.0,
47
+ "max_length": 20,
48
+ "max_position_embeddings": 77,
49
+ "min_length": 0,
50
+ "model_type": "clip_text_model",
51
+ "no_repeat_ngram_size": 0,
52
+ "num_attention_heads": 12,
53
+ "num_beam_groups": 1,
54
+ "num_beams": 1,
55
+ "num_hidden_layers": 12,
56
+ "num_return_sequences": 1,
57
+ "output_attentions": false,
58
+ "output_hidden_states": false,
59
+ "output_scores": false,
60
+ "pad_token_id": 1,
61
+ "prefix": null,
62
+ "problem_type": null,
63
+ "pruned_heads": {},
64
+ "remove_invalid_values": false,
65
+ "repetition_penalty": 1.0,
66
+ "return_dict": true,
67
+ "return_dict_in_generate": false,
68
+ "sep_token_id": null,
69
+ "task_specific_params": null,
70
+ "temperature": 1.0,
71
+ "tf_legacy_loss": false,
72
+ "tie_encoder_decoder": false,
73
+ "tie_word_embeddings": true,
74
+ "tokenizer_class": null,
75
+ "top_k": 50,
76
+ "top_p": 1.0,
77
+ "torch_dtype": null,
78
+ "torchscript": false,
79
+ "transformers_version": "4.21.1",
80
+ "typical_p": 1.0,
81
+ "use_bfloat16": false,
82
+ "vocab_size": 49408
83
+ },
84
+ "text_config_dict": {
85
+ "hidden_size": 768,
86
+ "intermediate_size": 3072,
87
+ "num_attention_heads": 12,
88
+ "num_hidden_layers": 12
89
+ },
90
+ "torch_dtype": "float16",
91
+ "transformers_version": null,
92
+ "vision_config": {
93
+ "_name_or_path": "",
94
+ "add_cross_attention": false,
95
+ "architectures": null,
96
+ "attention_dropout": 0.0,
97
+ "bad_words_ids": null,
98
+ "bos_token_id": null,
99
+ "chunk_size_feed_forward": 0,
100
+ "cross_attention_hidden_size": null,
101
+ "decoder_start_token_id": null,
102
+ "diversity_penalty": 0.0,
103
+ "do_sample": false,
104
+ "dropout": 0.0,
105
+ "early_stopping": false,
106
+ "encoder_no_repeat_ngram_size": 0,
107
+ "eos_token_id": null,
108
+ "exponential_decay_length_penalty": null,
109
+ "finetuning_task": null,
110
+ "forced_bos_token_id": null,
111
+ "forced_eos_token_id": null,
112
+ "hidden_act": "quick_gelu",
113
+ "hidden_size": 1024,
114
+ "id2label": {
115
+ "0": "LABEL_0",
116
+ "1": "LABEL_1"
117
+ },
118
+ "image_size": 224,
119
+ "initializer_factor": 1.0,
120
+ "initializer_range": 0.02,
121
+ "intermediate_size": 4096,
122
+ "is_decoder": false,
123
+ "is_encoder_decoder": false,
124
+ "label2id": {
125
+ "LABEL_0": 0,
126
+ "LABEL_1": 1
127
+ },
128
+ "layer_norm_eps": 1e-05,
129
+ "length_penalty": 1.0,
130
+ "max_length": 20,
131
+ "min_length": 0,
132
+ "model_type": "clip_vision_model",
133
+ "no_repeat_ngram_size": 0,
134
+ "num_attention_heads": 16,
135
+ "num_beam_groups": 1,
136
+ "num_beams": 1,
137
+ "num_channels": 3,
138
+ "num_hidden_layers": 24,
139
+ "num_return_sequences": 1,
140
+ "output_attentions": false,
141
+ "output_hidden_states": false,
142
+ "output_scores": false,
143
+ "pad_token_id": null,
144
+ "patch_size": 14,
145
+ "prefix": null,
146
+ "problem_type": null,
147
+ "pruned_heads": {},
148
+ "remove_invalid_values": false,
149
+ "repetition_penalty": 1.0,
150
+ "return_dict": true,
151
+ "return_dict_in_generate": false,
152
+ "sep_token_id": null,
153
+ "task_specific_params": null,
154
+ "temperature": 1.0,
155
+ "tf_legacy_loss": false,
156
+ "tie_encoder_decoder": false,
157
+ "tie_word_embeddings": true,
158
+ "tokenizer_class": null,
159
+ "top_k": 50,
160
+ "top_p": 1.0,
161
+ "torch_dtype": null,
162
+ "torchscript": false,
163
+ "transformers_version": "4.21.1",
164
+ "typical_p": 1.0,
165
+ "use_bfloat16": false
166
+ },
167
+ "vision_config_dict": {
168
+ "hidden_size": 1024,
169
+ "intermediate_size": 4096,
170
+ "num_attention_heads": 16,
171
+ "num_hidden_layers": 24,
172
+ "patch_size": 14
173
+ }
174
+ }
safety_checker/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d37ca6e57ace94e4c2f03ed0f67b6dc83e1ef1160892074917aa68b28e2afc1
3
+ size 608098599
sample.jpg ADDED
scheduler/.ipynb_checkpoints/scheduler_config-checkpoint.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "_class_name": "PNDMScheduler",
3
+ "_diffusers_version": "0.2.2",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "num_train_timesteps": 1000,
8
+ "skip_prk_steps": true
9
+ }
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "_class_name": "PNDMScheduler",
3
+ "_diffusers_version": "0.2.3",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "num_train_timesteps": 1000,
8
+ "skip_prk_steps": true
9
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "_name_or_path": "./text_encoder",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "quick_gelu",
11
+ "hidden_size": 768,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "torch_dtype": "float16",
22
+ "transformers_version": "4.21.1",
23
+ "vocab_size": 49408
24
+ }
text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88bd85efb0f84e70521633f578715afb2873db4f2615fdfb1f66e99934715865
3
+ size 246184375
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "do_lower_case": true,
12
+ "eos_token": {
13
+ "__type": "AddedToken",
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "errors": "replace",
21
+ "model_max_length": 77,
22
+ "name_or_path": "./tokenizer",
23
+ "pad_token": "<|endoftext|>",
24
+ "special_tokens_map_file": "./special_tokens_map.json",
25
+ "tokenizer_class": "CLIPTokenizer",
26
+ "unk_token": {
27
+ "__type": "AddedToken",
28
+ "content": "<|endoftext|>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.2.3",
4
+ "_name_or_path": "./unet",
5
+ "act_fn": "silu",
6
+ "attention_head_dim": 8,
7
+ "block_out_channels": [
8
+ 320,
9
+ 640,
10
+ 1280,
11
+ 1280
12
+ ],
13
+ "center_input_sample": false,
14
+ "cross_attention_dim": 768,
15
+ "down_block_types": [
16
+ "CrossAttnDownBlock2D",
17
+ "CrossAttnDownBlock2D",
18
+ "CrossAttnDownBlock2D",
19
+ "DownBlock2D"
20
+ ],
21
+ "downsample_padding": 1,
22
+ "flip_sin_to_cos": true,
23
+ "freq_shift": 0,
24
+ "in_channels": 4,
25
+ "layers_per_block": 2,
26
+ "mid_block_scale_factor": 1,
27
+ "norm_eps": 1e-05,
28
+ "norm_num_groups": 32,
29
+ "out_channels": 4,
30
+ "sample_size": 64,
31
+ "up_block_types": [
32
+ "UpBlock2D",
33
+ "CrossAttnUpBlock2D",
34
+ "CrossAttnUpBlock2D",
35
+ "CrossAttnUpBlock2D"
36
+ ]
37
+ }
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d98edd280d5e040ee77f5802b8e3be3513de757335d1dedc4e495647e7c2d573
3
+ size 1719312805
vae/config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.2.3",
4
+ "_name_or_path": "./vae",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "layers_per_block": 2,
21
+ "out_channels": 3,
22
+ "sample_size": 512,
23
+ "up_block_types": [
24
+ "UpDecoderBlock2D",
25
+ "UpDecoderBlock2D",
26
+ "UpDecoderBlock2D",
27
+ "UpDecoderBlock2D"
28
+ ]
29
+ }
vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c8904bc921e1e6f354b5fa8e99a1c82ead2f0540114de21557b8abfbb24ad0
3
+ size 167399505