models from Olive

Browse files

Files changed (11) hide show

README.md +11 -6
model_index.json +37 -0
scheduler/scheduler_config.json +21 -0
{ORT_CUDA/sd-turbo/engine/clip.ort_cuda.fp16 → text_encoder}/model.onnx +2 -2
tokenizer/merges.txt +0 -0
tokenizer/special_tokens_map.json +30 -0
tokenizer/tokenizer_config.json +38 -0
tokenizer/vocab.json +0 -0
ORT_CUDA/sd-turbo/engine/unet.ort_cuda.fp16/model.onnx.data → unet/model.onnx +2 -2
{ORT_CUDA/sd-turbo/engine/unet.ort_cuda.fp16 → vae_decoder}/model.onnx +2 -2
{ORT_CUDA/sd-turbo/engine/vae.ort_cuda.fp16 → vae_encoder}/model.onnx +2 -2

README.md CHANGED Viewed

@@ -20,6 +20,11 @@ tags:
 This repository hosts the optimized versions of **SD Turbo** to accelerate inference with ONNX Runtime CUDA execution provider.
 See the [usage instructions](#usage-example) for how to run the SDXL pipeline with the ONNX files hosted in this repository.
 ## Model Description
@@ -37,10 +42,10 @@ Below is average latency of generating an image of size 512x512 using NVIDIA A10
 | Engine      | Batch Size | Steps | PyTorch 2.1     | ONNX Runtime CUDA |
 |-------------|------------|------ | ----------------|-------------------|
-| Static      | 1          |   1   | 85.3 ms         |  32.9 ms          |
-| Static      | 4          |   1   | 213.8 ms        |  97.5 ms          |
-| Static      | 1          |   4   | 117.4 ms        |  62.5 ms          |
-| Static      | 4          |   4   | 294.3 ms        | 168.3 ms          |
 Static means the engine is built for the given batch size and image size combination, and CUDA graph is used to speed up.
@@ -61,7 +66,7 @@ cd onnxruntime
 2. Download the SDXL ONNX files from this repo
 ```shell
 git lfs install
-git clone https://huggingface.co/tlwu/sdxl-turbo-onnxruntime
 ```
 3. Launch the docker
@@ -97,5 +102,5 @@ python3 -m pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url
 python3 demo_txt2img.py \
   "starry night over Golden Gate Bridge by van gogh" \
   --version sd-turbo   \
-  --work-dir /workspace/sd-turbo-onnxruntime
 ```

 This repository hosts the optimized versions of **SD Turbo** to accelerate inference with ONNX Runtime CUDA execution provider.
+The models are generated by [Olive](https://github.com/microsoft/Olive/tree/main/examples/stable_diffusion) with a command like the following:
+```
+python stable_diffusion.py --provider cuda --model_id stabilityai/sd-turbo --optimize --use_fp16_fixed_vae
+```
 See the [usage instructions](#usage-example) for how to run the SD Turbo pipeline with the ONNX files hosted in this repository.
 ## Model Description
 | Engine      | Batch Size | Steps | PyTorch 2.1     | ONNX Runtime CUDA |
 |-------------|------------|------ | ----------------|-------------------|
+| Static      | 1          |   1   | 85.3 ms         |  38.2 ms          |
+| Static      | 4          |   1   | 213.8 ms        | 120.2 ms          |
+| Static      | 1          |   4   | 117.4 ms        |  68.7 ms          |
+| Static      | 4          |   4   | 294.3 ms        | 192.6 ms          |
 Static means the engine is built for the given batch size and image size combination, and CUDA graph is used to speed up inference.
 2. Download the SD Turbo ONNX files from this repo
 ```shell
 git lfs install
+git clone https://huggingface.co/tlwu/sd-turbo-onnxruntime
 ```
 3. Launch the docker
 python3 demo_txt2img.py \
   "starry night over Golden Gate Bridge by van gogh" \
   --version sd-turbo   \
+  --engine-dir /workspace/sd-turbo-onnxruntime
 ```

model_index.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_class_name": "OnnxStableDiffusionPipeline",
+  "_diffusers_version": "0.24.0",
+  "feature_extractor": [
+    null,
+    null
+  ],
+  "requires_safety_checker": true,
+  "safety_checker": [
+    null,
+    null
+  ],
+  "scheduler": [
+    "diffusers",
+    "EulerDiscreteScheduler"
+  ],
+  "text_encoder": [
+    "diffusers",
+    "OnnxRuntimeModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "OnnxRuntimeModel"
+  ],
+  "vae_decoder": [
+    "diffusers",
+    "OnnxRuntimeModel"
+  ],
+  "vae_encoder": [
+    "diffusers",
+    "OnnxRuntimeModel"
+  ]
+}

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "_class_name": "EulerDiscreteScheduler",
+  "_diffusers_version": "0.24.0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "interpolation_type": "linear",
+  "num_train_timesteps": 1000,
+  "prediction_type": "epsilon",
+  "sample_max_value": 1.0,
+  "set_alpha_to_one": false,
+  "sigma_max": null,
+  "sigma_min": null,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "timestep_spacing": "trailing",
+  "timestep_type": "discrete",
+  "trained_betas": null,
+  "use_karras_sigmas": false
+}

{ORT_CUDA/sd-turbo/engine/clip.ort_cuda.fp16 → text_encoder}/model.onnx RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f1b1d827fb6013c67ac3f349b06d1a159373c2faf5253555e20b14ce2ebaacf
-size 680852028

 version https://git-lfs.github.com/spec/v1
+oid sha256:a0d07fcce1858783b635c4027a562c89690d7c9f3f36c6129f1aa35af380f3d8
+size 680854339

tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "!",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "!",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49406": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49407": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "!",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": "<|endoftext|>"
+}

tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ORT_CUDA/sd-turbo/engine/unet.ort_cuda.fp16/model.onnx.data → unet/model.onnx RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:402069ca12429e3f7b810770a49f0121a3f0f025a38317ebfd4b956d7de1c41e
-size 1732024320

 version https://git-lfs.github.com/spec/v1
+oid sha256:3b1170b54ca0ceafe1c709cd0268bd98892928e5589a7d72aec817669472beda
+size 1732410827

{ORT_CUDA/sd-turbo/engine/unet.ort_cuda.fp16 → vae_decoder}/model.onnx RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf77e0fc1a30bd77cbe86ddefccaadc30e2dc3a667c92418f3811f5417ce2af1
-size 371766

 version https://git-lfs.github.com/spec/v1
+oid sha256:8b5a9263fbd3ebd2647bae8e99cc99fdcff70a0b1b6a77ca7544c5f3e9e91649
+size 99072656

{ORT_CUDA/sd-turbo/engine/vae.ort_cuda.fp16 → vae_encoder}/model.onnx RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a72b771027dde10c6bb0af1347c9e30b0df54f186f3ff7c688f2b89354bcc7a2
-size 99070385

 version https://git-lfs.github.com/spec/v1
+oid sha256:dac0a94d703ba0ed900efaadc9d6e8dc4e9f1977d0f87f050aa05cb40d7c1638
+size 68412355