Spaces: Running

Commit 31379ae
Parent(s): ceca31f

Update app.py

app.py CHANGED
@@ -24,28 +24,28 @@ def display_image(image=None,width=500,height=500):
 # API Gateway endpoint URL
 api_url = 'https://a02q342s5b.execute-api.us-east-2.amazonaws.com/reinvent-demo-inf2-sm-20231114'

-# Define the CSS to change the text input background color
-input_field_style = """
-<style>
-/* Customize the text input field background and text color */
-.stTextInput input {
-    background-color: #fbd8bf; /* 'Rind' color */
-    color: #232F3E; /* Dark text color */
-}
-/* You might also want to change the color for textarea if you're using it */
-.stTextArea textarea {
-    background-color: #fbd8bf; /* 'Rind' color */
-    color: #232F3E; /* Dark text color */
-}
-</style>
-"""

-# Inject custom styles into the Streamlit app
-st.markdown(input_field_style, unsafe_allow_html=True)


 # Creating Tabs
-tab1, tab2 = st.tabs(["Image Generation", "Architecture"])

 with tab1:
     # Create two columns for layout
@@ -122,10 +122,527 @@ with tab1:

 with tab2:
     # ===========
-
-
-
-
-
-
-
 # API Gateway endpoint URL
 api_url = 'https://a02q342s5b.execute-api.us-east-2.amazonaws.com/reinvent-demo-inf2-sm-20231114'

+# # Define the CSS to change the text input background color
+# input_field_style = """
+# <style>
+# /* Customize the text input field background and text color */
+# .stTextInput input {
+#     background-color: #fbd8bf; /* 'Rind' color */
+#     color: #232F3E; /* Dark text color */
+# }
+# /* You might also want to change the color for textarea if you're using it */
+# .stTextArea textarea {
+#     background-color: #fbd8bf; /* 'Rind' color */
+#     color: #232F3E; /* Dark text color */
+# }
+# </style>
+# """

+# # Inject custom styles into the Streamlit app
+# st.markdown(input_field_style, unsafe_allow_html=True)


 # Creating Tabs
+tab1, tab2, tab3 = st.tabs(["Image Generation", "Architecture", "Code"])

 with tab1:
     # Create two columns for layout

 with tab2:
     # ===========
+    left_column, _, right_column = st.columns([2, .2, 3])
+
+    with right_column:
+        # Define Streamlit UI elements
+        st.markdown("""<br>""", unsafe_allow_html=True)
+        st.markdown("""<br>""", unsafe_allow_html=True)
+        st.markdown("""<br>""", unsafe_allow_html=True)
+        st.markdown("""<br>""", unsafe_allow_html=True)
+        st.markdown("""<br>""", unsafe_allow_html=True)
+        st.image('./architecture.png', caption=f"Application Architecture")
+
+    with left_column:
+        st.write("## Architecture Overview")
+        st.write("This diagram illustrates the architecture of our Generative AI service, which is composed of several interconnected AWS services, notably Amazon Elastic Compute Cloud (Amazon EC2). Here's a detailed look at each component:")
+
+        with st.expander("(1) Inference Models"):
+            st.markdown("""
+            - The architecture starts with our trained machine learning models hosted on Amazon SageMaker, running on an AWS Inferentia2 instance (`inf2.xlarge`).
+            - There are two models shown here: Stable Diffusion XL for image generation, and Llama 2 7B for text generation.
+            """)
+
+        with st.expander("(2) Amazon SageMaker Endpoints"):
+            st.markdown("""
+            - The models are exposed via SageMaker Endpoints, which provide scalable and secure real-time inference services.
+            - These endpoints are the interfaces through which the models receive input data and return predictions.
+            """)
+
+        with st.expander("(3) AWS Lambda"):
+            st.markdown("""
+            - AWS Lambda functions serve as the middle layer, handling the logic of communicating with the SageMaker Endpoints.
+            - Lambda can process the incoming requests, perform any necessary transformations, call the endpoints, and then process the results before sending them back.
+            """)
+
+        with st.expander("(4) Amazon API Gateway"):
+            st.markdown("""
+            - The processed results from Lambda are then routed through Amazon API Gateway.
+            - API Gateway acts as a front door to manage all incoming API requests, including authorization, throttling, and CORS handling.
+            """)
+
+        with st.expander("(5) Streamlit Frontend"):
+            st.markdown("""
+            - Finally, our Streamlit application provides a user-friendly interface for end-users to interact with the service.
+            - It sends requests to the API Gateway and displays the returned predictions from the machine learning models.
+            """)
+
+        st.write("""
+        In summary, this architecture enables a scalable, serverless, and responsive Generative AI service that can serve real-time predictions to users directly from a web interface.
+        """)
+
+with tab3:
+    with st.expander("(1) Deploy GenAI Model to AWS Inferentia 2 Instance and Amazon SageMaker Endpoint"):
+        st.markdown(
+            """
+[Source] This code is modified from this fantastic blog by Phil Schmid at Hugging Face: https://www.philschmid.de/inferentia2-stable-diffusion-xl
+
+# Deploy Stable Diffusion on AWS Inferentia2 with Amazon SageMaker
+
+In this end-to-end tutorial, you will learn how to deploy and speed up Stable Diffusion XL inference using AWS Inferentia2 and [optimum-neuron](https://huggingface.co/docs/optimum-neuron/index) on Amazon SageMaker. [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index) is the interface between the Hugging Face Transformers & Diffusers libraries and AWS Accelerators, including AWS Trainium and AWS Inferentia2.
+
+You will learn how to:
+
+1. Convert Stable Diffusion XL to AWS Neuron (Inferentia2) with `optimum-neuron`
+2. Create a custom `inference.py` script for Stable Diffusion
+3. Upload the neuron model and inference script to Amazon S3
+4. Deploy a Real-time Inference Endpoint on Amazon SageMaker
+5. Generate images using the deployed model
+
+## Quick intro: AWS Inferentia 2
+
+[AWS Inferentia2 (Inf2)](https://aws.amazon.com/de/ec2/instance-types/inf2/) instances are purpose-built EC2 instances for deep learning (DL) inference workloads. Inferentia 2 is the successor of [AWS Inferentia](https://aws.amazon.com/ec2/instance-types/inf1/?nc1=h_ls) and promises to deliver up to 4x higher throughput and up to 10x lower latency.
+
+| instance size | accelerators | Neuron Cores | accelerator memory (GB) | vCPU | CPU memory (GiB) | on-demand price ($/h) |
+| ------------- | ------------ | ------------ | ----------------------- | ---- | ---------------- | --------------------- |
+| inf2.xlarge   | 1            | 2            | 32                      | 4    | 16               | 0.76                  |
+| inf2.8xlarge  | 1            | 2            | 32                      | 32   | 128              | 1.97                  |
+| inf2.24xlarge | 6            | 12           | 192                     | 96   | 384              | 6.49                  |
+| inf2.48xlarge | 12           | 24           | 384                     | 192  | 768              | 12.98                 |
+
+Additionally, Inferentia 2 will support the writing of custom operators in C++ and new datatypes, including `FP8` (cFP8).
+
+Let's get started! 🚀
+
+*If you are going to use SageMaker in a local environment (not SageMaker Studio or Notebook Instances), you need access to an IAM Role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).*
+
+## 1. Convert Stable Diffusion to AWS Neuron (Inferentia2) with `optimum-neuron`
+
+We are going to use [optimum-neuron](https://huggingface.co/docs/optimum-neuron/index) to compile/convert our model to neuronx. Optimum Neuron provides a set of tools enabling easy model loading, training and inference on single- and multi-accelerator settings for different downstream tasks.
+
+As a first step, we need to install `optimum-neuron` and the other required packages.
+
+*Tip: If you are using Amazon SageMaker Notebook Instances or Studio, you can go with the `conda_python3` conda kernel.*
+
+```python
+# Install the required packages
+%pip install "optimum-neuron==0.0.13" "diffusers==0.21.4" --upgrade
+%pip install "sagemaker>=2.197.0" --upgrade
+```
+
+After we have installed `optimum-neuron`, we can load and convert our model.
+
+We are going to use the [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model. Stable Diffusion XL (SDXL) from [Stability AI](https://stability.ai/) is the newest text-to-image generation model, which can create photorealistic images with more detailed imagery and composition than previous SD models, including SD 2.1.
+
+At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-features/dynamic-shapes.html?highlight=dynamic%20shapes#), which means that we need to specify our image size in advance for compilation and inference.
+
+In simpler terms, this means we need to define the input shapes for our prompt (sequence length), batch size, and the height and width of the image.
+
+We precompiled the model with the following parameters and pushed it to the Hugging Face Hub:
+* `height`: 1024
+* `width`: 1024
+* `sequence_length`: 128
+* `num_images_per_prompt`: 1
+* `batch_size`: 1
+* `neuron`: 2.15.0
+
+_Note: If you want to compile your own model or a different Stable Diffusion XL checkpoint, you need ~120GB of memory and the compilation can take ~45 minutes. We used an `inf2.8xlarge` EC2 instance with the [Hugging Face Neuron Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) to compile the model._
+
+```python
+from huggingface_hub import snapshot_download
+
+# compiled model id
+compiled_model_id = "aws-neuron/stable-diffusion-xl-base-1-0-1024x1024"
+
+# save compiled model to local directory
+save_directory = "sdxl_neuron"
+# Downloads our compiled model from the HuggingFace Hub
+# using the revision as neuron version reference
+# and makes sure we exclude the symlink files and "hidden" files, like .DS_Store, .gitignore, etc.
+snapshot_download(compiled_model_id, revision="2.15.0", local_dir=save_directory, local_dir_use_symlinks=False, allow_patterns=["[!.]*.*"])
+
+
+###############################################
+# COMMENT IN BELOW TO COMPILE DIFFERENT MODEL #
+###############################################
+#
+# from optimum.neuron import NeuronStableDiffusionXLPipeline
+#
+# # model id you want to compile
+# vanilla_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+#
+# # configs for compiling model
+# compiler_args = {"auto_cast": "all", "auto_cast_type": "bf16"}
+# input_shapes = {
+#     "height": 1024,             # height of the image
+#     "width": 1024,              # width of the image
+#     "num_images_per_prompt": 1, # number of images to generate per prompt
+#     "batch_size": 1             # batch size for the model
+# }
+#
+# sd = NeuronStableDiffusionXLPipeline.from_pretrained(vanilla_model_id, export=True, **input_shapes, **compiler_args)
+#
+# # Save locally or upload to the HuggingFace Hub
+# save_directory = "sdxl_neuron"
+# sd.save_pretrained(save_directory)
+```
+
+## 2. Create a custom `inference.py` script for Stable Diffusion
+
+The [Hugging Face Inference Toolkit](https://github.com/aws/sagemaker-huggingface-inference-toolkit) supports zero-code deployments on top of the [pipeline feature](https://huggingface.co/transformers/main_classes/pipelines.html) from 🤗 Transformers. This allows users to deploy Hugging Face transformers without an inference script [[Example](https://github.com/huggingface/notebooks/blob/master/sagemaker/11_deploy_model_from_hf_hub/deploy_transformer_model_from_hf_hub.ipynb)].
+
+Currently this feature is not supported with AWS Inferentia2, which means we need to provide an `inference.py` for running inference. But `optimum-neuron` has integrated support for the 🤗 Diffusers pipeline feature. That way we can use `optimum-neuron` to create a pipeline for our model.
+
+If you want to know more about the `inference.py` script, check out this [example](https://github.com/huggingface/notebooks/blob/master/sagemaker/17_custom_inference_script/sagemaker-notebook.ipynb). It explains, amongst other things, what the `model_fn` and `predict_fn` are.
+
+```python
+# create code directory in our model directory
+!mkdir {save_directory}/code
+```
+
+We are using `NEURON_RT_NUM_CORES=2` to make sure that each HTTP worker uses 2 Neuron cores to maximize throughput.
+
+```python
+%%writefile {save_directory}/code/inference.py
+import os
+# To use two neuron cores per worker
+os.environ["NEURON_RT_NUM_CORES"] = "2"
+import torch
+import torch_neuronx
+import base64
+from io import BytesIO
+from optimum.neuron import NeuronStableDiffusionXLPipeline
+
+
+def model_fn(model_dir):
+    # load local converted model into pipeline
+    pipeline = NeuronStableDiffusionXLPipeline.from_pretrained(model_dir, device_ids=[0, 1])
+    return pipeline
+
+
+def predict_fn(data, pipeline):
+    # extract prompt from data
+    prompt = data.pop("inputs", data)
+
+    parameters = data.pop("parameters", None)
+
+    if parameters is not None:
+        generated_images = pipeline(prompt, **parameters)["images"]
+    else:
+        generated_images = pipeline(prompt)["images"]
+
+    # postprocess: convert each image into a base64 string
+    encoded_images = []
+    for image in generated_images:
+        buffered = BytesIO()
+        image.save(buffered, format="JPEG")
+        encoded_images.append(base64.b64encode(buffered.getvalue()).decode())
+
+    # return the base64-encoded images
+    return {"generated_images": encoded_images}
+```
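As a quick sanity check, the two handler functions above can be exercised locally before packaging (a sketch; it assumes the compiled model was downloaded to `sdxl_neuron` and that Neuron cores are available on the machine):

```python
# Illustrative local smoke test for model_fn/predict_fn (assumes ./sdxl_neuron exists)
pipe = model_fn("sdxl_neuron")
out = predict_fn({"inputs": "A lighthouse at dawn", "parameters": {"num_inference_steps": 25}}, pipe)
print(len(out["generated_images"]), "image(s) returned as base64 strings")
```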
+
+## 3. Upload the neuron model and inference script to Amazon S3
+
+Before we can deploy our neuron model to Amazon SageMaker, we need to upload all of our model artifacts to Amazon S3.
+
+_Note: Currently, `inf2` instances are only available in the `us-east-2` and `us-east-1` regions [[REF](https://aws.amazon.com/de/about-aws/whats-new/2023/05/sagemaker-ml-inf2-ml-trn1-instances-model-deployment/)]. Therefore we need to force the region to us-east-2._
+
+Let's create our SageMaker session and upload our model to Amazon S3.
+
+```python
+import sagemaker
+import boto3
+sess = sagemaker.Session()
+# sagemaker session bucket -> used for uploading data, models and logs
+# sagemaker will automatically create this bucket if it does not exist
+sagemaker_session_bucket = None
+if sagemaker_session_bucket is None and sess is not None:
+    # set to default bucket if a bucket name is not given
+    sagemaker_session_bucket = sess.default_bucket()
+
+try:
+    role = sagemaker.get_execution_role()
+except ValueError:
+    iam = boto3.client('iam')
+    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
+
+sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
+
+print(f"sagemaker role arn: {role}")
+print(f"sagemaker bucket: {sess.default_bucket()}")
+print(f"sagemaker session region: {sess.boto_region_name}")
+assert sess.boto_region_name in ["us-east-2", "us-east-1"], "region must be us-east-2 or us-east-1, due to instance availability"
+```
+
+We create our `model.tar.gz` with our `inference.py` script:
+
+```python
+# create a model.tar.gz archive with all the model artifacts and the inference.py script.
+%cd {save_directory}
+!tar zcvf model.tar.gz *
+%cd ..
+```
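The layout inside the archive matters: SageMaker expects `inference.py` under `code/` at the top level of the tarball. A quick listing confirms it (a sketch, using the same notebook-style shell commands as above):

```python
# Peek at the archive layout; inference.py should appear under code/
!tar -tzf {save_directory}/model.tar.gz | head
```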
+
+Next, we upload our `model.tar.gz` to Amazon S3 using our session bucket and the `sagemaker` SDK.
+
+```python
+from sagemaker.s3 import S3Uploader
+
+# create s3 uri
+s3_model_path = f"s3://{sess.default_bucket()}/neuronx/sdxl"
+
+# upload model.tar.gz
+s3_model_uri = S3Uploader.upload(local_path=f"{save_directory}/model.tar.gz", desired_s3_uri=s3_model_path)
+print(f"model artifacts uploaded to {s3_model_uri}")
+```
+
+## 4. Deploy a Real-time Inference Endpoint on Amazon SageMaker
+
+After we have uploaded our model artifacts to Amazon S3, we can create a custom `HuggingFaceModel`. This class will be used to create and deploy our real-time inference endpoint on Amazon SageMaker.
+
+The `inf2.xlarge` instance type is the smallest instance type with AWS Inferentia2 support. It comes with 1 Inferentia2 chip with 2 Neuron Cores. This means we can use 2 Neuron Cores to minimize latency for our image generation.
+
+```python
+from sagemaker.huggingface.model import HuggingFaceModel
+
+# create Hugging Face Model Class
+huggingface_model = HuggingFaceModel(
+    model_data=s3_model_uri,        # path to your model.tar.gz on s3
+    role=role,                      # iam role with permissions to create an Endpoint
+    transformers_version="4.34.1",  # transformers version used
+    pytorch_version="1.13.1",       # pytorch version used
+    py_version='py310',             # python version used
+    model_server_workers=1,         # number of workers for the model server
+)
+
+# deploy the endpoint
+predictor = huggingface_model.deploy(
+    initial_instance_count=1,       # number of instances
+    instance_type="ml.inf2.xlarge", # AWS Inferentia instance
+    volume_size=100
+)
+# ignore the "Your model is not compiled. Please compile your model before using Inferentia." warning, we already compiled our model.
+```
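The name SageMaker assigns to this endpoint is what the Lambda function in section (2) needs; it can be read straight off the predictor (sketch):

```python
# SageMaker endpoint name to plug into the Lambda function's endpoint_name variable
print(predictor.endpoint_name)
```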
+
+## 5. Generate images using the deployed model
+
+The `.deploy()` call returns a `HuggingFacePredictor` object which can be used to request inference. Our endpoint expects a `json` with at least an `inputs` key. The `inputs` key is the input prompt for the model, which will be used to generate the image. Additionally, we can provide inference parameters, e.g. `num_inference_steps`.
+
+The `predictor.predict()` function returns a `json` with the `generated_images` key. The `generated_images` key contains the generated image as a `base64` encoded string. To decode our response we added a small helper function `decode_base64_image` which takes the `base64` encoded string and returns a `PIL.Image` object, and `display_image` displays it.
+
+```python
+from PIL import Image
+from io import BytesIO
+from IPython.display import display
+import base64
+
+# helper decoder
+def decode_base64_image(image_string):
+    base64_image = base64.b64decode(image_string)
+    buffer = BytesIO(base64_image)
+    return Image.open(buffer)
+
+# display PIL images as grid
+def display_image(image=None, width=500, height=500):
+    img = image.resize((width, height))
+    display(img)
+```
+
+Now, let's generate some images. As an example: `A dog trying catch a flying pizza in style of comic book, at a street corner.` Generating an image with 25 steps takes around ~6 seconds, except for the first request, which can take 45-60s.
+_Note: If the request times out, just rerun it. Only the first request takes a long time._
+
+```python
+prompt = "A dog trying catch a flying pizza at a street corner, comic book, well lit, night time"
+
+# run prediction
+response = predictor.predict(data={
+    "inputs": prompt,
+    "parameters": {
+        "num_inference_steps": 25,
+        "negative_prompt": "disfigured, ugly, deformed"
+    }
+}
+)
+
+# decode and display image
+display_image(decode_base64_image(response["generated_images"][0]))
+```
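The decoded image can also be written to disk for later use (a small illustrative addition; the filename is arbitrary):

```python
# Save the decoded image next to the notebook (illustrative)
decode_base64_image(response["generated_images"][0]).save("dog_pizza.jpg")
```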
+
+
+
+### Delete model and endpoint
+
+To clean up, we can delete the model and endpoint.
+
+```python
+predictor.delete_model()
+predictor.delete_endpoint()
+```
+
+            """
+        )
+
+    with st.expander("(2) AWS Lambda Function to handle inference requests"):
+        st.markdown(
+            """
+```python
+import boto3
+import json
+
+def lambda_handler(event, context):
+    # SageMaker endpoint details
+    endpoint_name = 'INSERT_YOUR_SAGEMAKER_ENDPOINT_NAME_HERE'
+    runtime = boto3.client('sagemaker-runtime')
+
+    # Sample input data (modify as per your model's input requirements)
+    # Get the prompt from the Lambda function input
+    print("======== event payload: ==========")
+    print(event['body'])
+
+    print("======== prompt payload: ==========")
+    event_parsed = json.loads(event['body'])
+    prompt = event_parsed.get('prompt', '')
+    print(prompt)
+    print("======== params payload: ==========")
+    params = event_parsed.get('parameters', '')
+    print(params)
+
+    # Prepare input data
+    model_input = {
+        'inputs': prompt,
+        'parameters': params
+    }
+
+    input_data = json.dumps(model_input)
+
+    # Make a prediction request to the SageMaker endpoint
+    response = runtime.invoke_endpoint(EndpointName=endpoint_name,
+                                       ContentType='application/json',
+                                       Body=input_data)
+
+    # Parse the response
+    result = response['Body'].read()
+    return {
+        'statusCode': 200,
+        'body': result
+    }
+
+```
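For reference, a request like the one the Streamlit app sends arrives at this handler roughly as follows (a sketch of a proxy-integration test event; the exact envelope depends on how API Gateway is configured):

```python
# Illustrative test event mirroring the Streamlit payload (Lambda proxy integration)
test_event = {
    "body": json.dumps({
        "prompt": "Raccoon astronaut in space, detailed, 8k",
        "parameters": {"num_inference_steps": 30, "seed": 555},
    })
}
print(lambda_handler(test_event, None)["statusCode"])
```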
+
+            """
+        )
+
+    with st.expander("(3) Streamlit app.py, running on an Amazon EC2 t2.micro instance"):
+        st.markdown(
+            """
+```python
+import streamlit as st
+# Set the page layout to 'wide'
+st.set_page_config(layout="wide")
+import requests
+from PIL import Image
+from io import BytesIO
+import base64
+import time
+
+
+# helper decoder
+def decode_base64_image(image_string):
+    base64_image = base64.b64decode(image_string)
+    buffer = BytesIO(base64_image)
+    return Image.open(buffer)
+
+# display PIL images as grid
+def display_image(image=None, width=500, height=500):
+    img = image.resize((width, height))
+    return img
+
+# API Gateway endpoint URL
+api_url = 'INSERT_YOUR_API_GATEWAY_ENDPOINT_URL_HERE'
+# Create two columns for layout
+left_column, right_column = st.columns(2)
+# ===========
+with left_column:
+    # Define Streamlit UI elements
+    st.title('Stable Diffusion XL Image Generation with AWS Inferentia')
+
+    prompt_one = st.text_area("Enter your prompt:",
+                              f"Raccoon astronaut in space, sci-fi, future, cold color palette, muted colors, detailed, 8k")
+
+    # Number of inference steps
+    num_inference_steps_one = st.slider("Number of Inference Steps",
+                                        min_value=1,
+                                        max_value=100,
+                                        value=30,
+                                        help="More steps might improve quality, with diminishing marginal returns. 30-50 seems best, but your mileage may vary.")
+
+    # Create an expandable section for optional parameters
+    with st.expander("Optional Parameters"):
+        # Random seed input
+        seed_one = st.number_input("Random seed",
+                                   value=555,
+                                   help="Set to the same value to generate the same image if other inputs are the same, change to generate a different image for the same inputs.")
+
+        # Negative prompt input
+        negative_prompt_one = st.text_area("Enter your negative prompt:",
+                                           "cartoon, graphic, text, painting, crayon, graphite, abstract glitch, blurry")
+
+    if st.button('Generate Image'):
+        with st.spinner(f'Generating Image with {num_inference_steps_one} iterations'):
+            with right_column:
+                start_time = time.time()
+                # ===============
+                # Example input data
+                prompt_input_one = {
+                    "prompt": prompt_one,
+                    "parameters": {
+                        "num_inference_steps": num_inference_steps_one,
+                        "seed": seed_one,
+                        "negative_prompt": negative_prompt_one
+                    }
+                }
+
+                # Make API request
+                response_one = requests.post(api_url, json=prompt_input_one)
+
+                # Process and display the response
+                if response_one.status_code == 200:
+                    result_one = response_one.json()
+                    # st.success(f"Prediction result: {result}")
+                    image_one = display_image(decode_base64_image(result_one["generated_images"][0]))
+                    st.image(image_one,
+                             caption=f"{prompt_one}")
+                    end_time = time.time()
+                    total_time = round(end_time - start_time, 2)
+                    st.text(f"Prompt: {prompt_one}")
+                    st.text(f"Number of Iterations: {num_inference_steps_one}")
+                    st.text(f"Random Seed: {seed_one}")
+                    st.text(f'Total time taken: {total_time} seconds')
+                    # Calculate and display the time per iteration in seconds
+                    time_per_iteration = total_time / num_inference_steps_one
+                    st.text(f'Time per iteration: {time_per_iteration:.2f} seconds')
+                else:
+                    st.error(f"Error: {response_one.text}")
+```
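Outside of Streamlit, the same API Gateway endpoint can be exercised directly with the payload shape used above (a minimal sketch; replace the placeholder URL with your own endpoint):

```python
import requests

payload = {
    "prompt": "A lighthouse at dawn, watercolor",
    "parameters": {"num_inference_steps": 30, "seed": 555, "negative_prompt": "blurry"},
}
# Replace with your API Gateway endpoint URL
resp = requests.post("INSERT_YOUR_API_GATEWAY_ENDPOINT_URL_HERE", json=payload, timeout=120)
resp.raise_for_status()
image_b64 = resp.json()["generated_images"][0]  # base64-encoded JPEG
```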
+
+            """
+        )