valhalla committed on
Commit
b9093ab
·
1 Parent(s): d3a3be4
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +194 -0
  3. requirements.txt +3 -0
  4. server.py +175 -0
README.md CHANGED
@@ -3,7 +3,7 @@ title: Glide Text2im
3
  emoji: 📊
4
  colorFrom: purple
5
  colorTo: gray
6
- sdk: streamlit
7
  app_file: app.py
8
  pinned: false
9
  ---
 
3
  emoji: 📊
4
  colorFrom: purple
5
  colorTo: gray
6
+ sdk: gradio
7
  app_file: app.py
8
  pinned: false
9
  ---
app.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+
4
+ import base64
5
+ from io import BytesIO
6
+ # from fastapi import FastAPI
7
+
8
+ from PIL import Image
9
+ import torch as th
10
+
11
+ from glide_text2im.download import load_checkpoint
12
+ from glide_text2im.model_creation import (
13
+ create_model_and_diffusion,
14
+ model_and_diffusion_defaults,
15
+ model_and_diffusion_defaults_upsampler
16
+ )
17
+
18
+ # print("Loading models...")
19
+ # app = FastAPI()
20
+
21
+ # This notebook supports both CPU and GPU.
22
+ # On CPU, generating one sample may take on the order of 20 minutes.
23
+ # On a GPU, it should be under a minute.
24
+
25
# Pick the compute device once; half precision is only switched on with CUDA.
has_cuda = th.cuda.is_available()
device = th.device('cuda' if has_cuda else 'cpu')

# ---- Base model -----------------------------------------------------------
options = model_and_diffusion_defaults()
options['timestep_respacing'] = '100'  # use 100 diffusion steps for fast sampling
options['use_fp16'] = has_cuda
model, diffusion = create_model_and_diffusion(**options)
model.eval()
if has_cuda:
    model.convert_to_fp16()
model.to(device)
model.load_state_dict(load_checkpoint('base', device))
print('total base parameters', sum(p.numel() for p in model.parameters()))

# ---- Upsampler model ------------------------------------------------------
options_up = model_and_diffusion_defaults_upsampler()
options_up['timestep_respacing'] = 'fast27'  # use 27 diffusion steps for very fast sampling
options_up['use_fp16'] = has_cuda
model_up, diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()
if has_cuda:
    model_up.convert_to_fp16()
model_up.to(device)
model_up.load_state_dict(load_checkpoint('upsample', device))
print('total upsampler parameters', sum(p.numel() for p in model_up.parameters()))
51
+
52
+
53
def get_images(batch: th.Tensor):
    """Convert a batch of image tensors into a single PIL image.

    The batch is indexed as (N, C, H, W) with C == 3, matching the shapes
    built by ``sample``. Values are mapped from [-1, 1] to [0, 255] and the
    N images are laid out side by side along the width.

    Args:
        batch: 4-D tensor of images with values nominally in [-1, 1].

    Returns:
        A ``PIL.Image.Image`` of height H and width N * W.
    """
    # Map [-1, 1] -> [0, 255]; clamp so out-of-range values cannot wrap in uint8.
    scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
    # (N, C, H, W) -> (H, N, W, C) -> (H, N * W, 3): one horizontal strip.
    reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
    # The original built the image but dropped it, so callers received None.
    return Image.fromarray(reshaped.numpy())
58
+
59
+
60
+ # Create a classifier-free guidance sampling function
61
guidance_scale = 3.0


def model_fn(x_t, ts, **kwargs):
    """Run the base model with classifier-free guidance.

    The first half of ``x_t`` is duplicated so both halves of the batch see
    the same noisy input. The first three output channels (eps) are then
    combined as ``uncond + guidance_scale * (cond - uncond)``; the remaining
    channels are passed through unchanged.
    """
    first_half = x_t[: len(x_t) // 2]
    doubled = th.cat([first_half, first_half], dim=0)
    out = model(doubled, ts, **kwargs)

    eps = out[:, :3]
    rest = out[:, 3:]

    # First chunk is treated as conditional, second as unconditional.
    cond_eps, uncond_eps = th.split(eps, len(eps) // 2, dim=0)
    guided = uncond_eps + guidance_scale * (cond_eps - uncond_eps)

    guided_both = th.cat([guided, guided], dim=0)
    return th.cat([guided_both, rest], dim=1)
72
+
73
+
74
+ # @app.get("/")
75
def read_root():
    """Return the fixed greeting payload: a one-element set holding "glide!"."""
    greeting = {"glide!"}
    return greeting
77
+
78
+ # @app.get("/{generate}")
79
def sample(prompt):
    """Generate an image for a text prompt and return it for display.

    Two stages run back to back: classifier-free-guided sampling from the
    base model, then DDIM sampling from the upsampler model conditioned on
    the base result.

    Args:
        prompt: Text describing the image to generate.

    Returns:
        Whatever ``get_images`` produces for the upsampled batch, i.e. the
        image object expected by the ``gr.outputs.Image(type="pil")`` output.
        The previous version returned base64-encoded JPEG bytes, which an
        image output component cannot render.
    """
    # Sampling parameters
    batch_size = 1

    # Tune this parameter to control the sharpness of 256x256 images.
    # A value of 1.0 is sharper, but sometimes results in grainy artifacts.
    upsample_temp = 0.997

    ##############################
    # Sample from the base model #
    ##############################

    # Create the text tokens to feed to the model.
    tokens = model.tokenizer.encode(prompt)
    tokens, mask = model.tokenizer.padded_tokens_and_mask(
        tokens, options['text_ctx']
    )

    # Create the classifier-free guidance tokens (empty)
    full_batch_size = batch_size * 2
    uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
        [], options['text_ctx']
    )

    # Pack the tokens together into model kwargs: conditional rows first,
    # unconditional rows second (the order model_fn splits them in).
    model_kwargs = dict(
        tokens=th.tensor(
            [tokens] * batch_size + [uncond_tokens] * batch_size, device=device
        ),
        mask=th.tensor(
            [mask] * batch_size + [uncond_mask] * batch_size,
            dtype=th.bool,
            device=device,
        ),
    )

    # Sample from the base model; keep only the first batch_size results.
    model.del_cache()
    samples = diffusion.p_sample_loop(
        model_fn,
        (full_batch_size, 3, options["image_size"], options["image_size"]),
        device=device,
        clip_denoised=True,
        progress=True,
        model_kwargs=model_kwargs,
        cond_fn=None,
    )[:batch_size]
    model.del_cache()

    ##############################
    # Upsample the 64x64 samples #
    ##############################

    tokens = model_up.tokenizer.encode(prompt)
    tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
        tokens, options_up['text_ctx']
    )

    # Create the model conditioning dict.
    model_kwargs = dict(
        # Low-res image to upsample, snapped to the 8-bit value grid.
        low_res=((samples + 1) * 127.5).round() / 127.5 - 1,

        # Text tokens
        tokens=th.tensor(
            [tokens] * batch_size, device=device
        ),
        mask=th.tensor(
            [mask] * batch_size,
            dtype=th.bool,
            device=device,
        ),
    )

    # Sample from the upsampler model.
    model_up.del_cache()
    up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
    up_samples = diffusion_up.ddim_sample_loop(
        model_up,
        up_shape,
        noise=th.randn(up_shape, device=device) * upsample_temp,
        device=device,
        clip_denoised=True,
        progress=True,
        model_kwargs=model_kwargs,
        cond_fn=None,
    )[:batch_size]
    model_up.del_cache()

    # Hand the image itself to Gradio instead of a base64 string.
    return get_images(up_samples)
174
+
175
+
176
def to_base64(pil_image):
    """Encode an image as JPEG and return the base64 form of those bytes.

    Args:
        pil_image: Object exposing ``save(fp, format=...)``.

    Returns:
        ``bytes`` holding the base64-encoded JPEG data.
    """
    with BytesIO() as jpeg_buffer:
        pil_image.save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()
    return base64.b64encode(jpeg_bytes)
180
+
181
# Static text shown around the demo.
title = "Interactive demo: glide-text2im"
description = "Demo for OpenAI's GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models."
# Links fixed to point at the GLIDE paper and code release; the originals
# referenced an unrelated arXiv paper and the image-gpt blog post.
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.10741'>GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models</a> | <a href='https://github.com/openai/glide-text2im'>Official code</a></p>"
examples = ["Eiffel tower"]

iface = gr.Interface(
    fn=sample,
    # `sample` takes a text prompt, so the input must be a text box; the
    # original used an Image input with an unsupported type="text".
    inputs=gr.inputs.Textbox(lines=1, label="Prompt"),
    outputs=gr.outputs.Image(type="pil", label="Model input + completions"),
    title=title,
    description=description,
    article=article,
    examples=examples,
    enable_queue=True,
)
iface.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/openai/glide-text2im.git
2
+ fastapi
3
+ uvicorn
server.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+ from fastapi import FastAPI
4
+
5
+ from PIL import Image
6
+ import torch as th
7
+
8
+ from glide_text2im.download import load_checkpoint
9
+ from glide_text2im.model_creation import (
10
+ create_model_and_diffusion,
11
+ model_and_diffusion_defaults,
12
+ model_and_diffusion_defaults_upsampler
13
+ )
14
+
15
print("Loading models...")
app = FastAPI()

# Works on both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.
has_cuda = th.cuda.is_available()
device = th.device('cuda' if has_cuda else 'cpu')

# ---- Base model -----------------------------------------------------------
options = model_and_diffusion_defaults()
options['timestep_respacing'] = '100'  # use 100 diffusion steps for fast sampling
options['use_fp16'] = has_cuda
model, diffusion = create_model_and_diffusion(**options)
model.eval()
if has_cuda:
    model.convert_to_fp16()
model.to(device)
model.load_state_dict(load_checkpoint('base', device))
print('total base parameters', sum(p.numel() for p in model.parameters()))

# ---- Upsampler model ------------------------------------------------------
options_up = model_and_diffusion_defaults_upsampler()
options_up['timestep_respacing'] = 'fast27'  # use 27 diffusion steps for very fast sampling
options_up['use_fp16'] = has_cuda
model_up, diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()
if has_cuda:
    model_up.convert_to_fp16()
model_up.to(device)
model_up.load_state_dict(load_checkpoint('upsample', device))
print('total upsampler parameters', sum(p.numel() for p in model_up.parameters()))
48
+
49
+
50
def get_images(batch: th.Tensor):
    """Convert a batch of image tensors into a single PIL image.

    The batch is indexed as (N, C, H, W) with C == 3, matching the shapes
    built by ``sample``. Values are mapped from [-1, 1] to [0, 255] and the
    N images are laid out side by side along the width.

    Args:
        batch: 4-D tensor of images with values nominally in [-1, 1].

    Returns:
        A ``PIL.Image.Image`` of height H and width N * W.
    """
    # Map [-1, 1] -> [0, 255]; clamp so out-of-range values cannot wrap in uint8.
    scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
    # (N, C, H, W) -> (H, N, W, C) -> (H, N * W, 3): one horizontal strip.
    reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
    # The original built the image but dropped it, so `to_base64` received None.
    return Image.fromarray(reshaped.numpy())
55
+
56
+
57
+ # Create a classifier-free guidance sampling function
58
guidance_scale = 3.0


def model_fn(x_t, ts, **kwargs):
    """Run the base model with classifier-free guidance.

    The first half of ``x_t`` is duplicated so both halves of the batch see
    the same noisy input. The first three output channels (eps) are then
    combined as ``uncond + guidance_scale * (cond - uncond)``; the remaining
    channels are passed through unchanged.
    """
    first_half = x_t[: len(x_t) // 2]
    doubled = th.cat([first_half, first_half], dim=0)
    out = model(doubled, ts, **kwargs)

    eps = out[:, :3]
    rest = out[:, 3:]

    # First chunk is treated as conditional, second as unconditional.
    cond_eps, uncond_eps = th.split(eps, len(eps) // 2, dim=0)
    guided = uncond_eps + guidance_scale * (cond_eps - uncond_eps)

    guided_both = th.cat([guided, guided], dim=0)
    return th.cat([guided_both, rest], dim=1)
69
+
70
+
71
@app.get("/")
def read_root():
    """Handle GET "/" by returning a one-element set holding "glide!"."""
    greeting = {"glide!"}
    return greeting
74
+
75
@app.get("/{generate}")
def sample(prompt):
    """Generate an image for ``prompt`` and return it base64-encoded.

    Stage 1 draws from the base model with classifier-free guidance
    (``model_fn``); stage 2 runs DDIM sampling on the upsampler model,
    conditioned on the stage-1 result.

    Args:
        prompt: Text describing the image to generate.

    Returns:
        ``{"image": <base64 JPEG bytes>}``.
    """
    n_images = 1

    # Scale applied to the upsampler's starting noise. 1.0 is sharper, but
    # sometimes results in grainy artifacts.
    noise_temp = 0.997

    # ------------------------------------------------------------------
    # Stage 1: base model
    # ------------------------------------------------------------------
    base_ctx = options['text_ctx']
    cond_tokens, cond_mask = model.tokenizer.padded_tokens_and_mask(
        model.tokenizer.encode(prompt), base_ctx
    )
    # Empty token list = the unconditional branch for classifier-free guidance.
    empty_tokens, empty_mask = model.tokenizer.padded_tokens_and_mask([], base_ctx)

    # Conditional rows first, unconditional rows second.
    base_kwargs = {
        "tokens": th.tensor(
            [cond_tokens] * n_images + [empty_tokens] * n_images,
            device=device,
        ),
        "mask": th.tensor(
            [cond_mask] * n_images + [empty_mask] * n_images,
            dtype=th.bool,
            device=device,
        ),
    }

    base_side = options["image_size"]
    base_shape = (n_images * 2, 3, base_side, base_side)

    model.del_cache()
    base_out = diffusion.p_sample_loop(
        model_fn,
        base_shape,
        device=device,
        clip_denoised=True,
        progress=True,
        model_kwargs=base_kwargs,
        cond_fn=None,
    )
    samples = base_out[:n_images]
    model.del_cache()

    # ------------------------------------------------------------------
    # Stage 2: upsampler
    # ------------------------------------------------------------------
    up_tokens, up_mask = model_up.tokenizer.padded_tokens_and_mask(
        model_up.tokenizer.encode(prompt), options_up['text_ctx']
    )

    up_kwargs = {
        # Low-res conditioning image, snapped to the 8-bit value grid.
        "low_res": ((samples + 1) * 127.5).round() / 127.5 - 1,
        "tokens": th.tensor([up_tokens] * n_images, device=device),
        "mask": th.tensor([up_mask] * n_images, dtype=th.bool, device=device),
    }

    model_up.del_cache()
    up_side = options_up["image_size"]
    up_shape = (n_images, 3, up_side, up_side)
    start_noise = th.randn(up_shape, device=device) * noise_temp
    up_out = diffusion_up.ddim_sample_loop(
        model_up,
        up_shape,
        noise=start_noise,
        device=device,
        clip_denoised=True,
        progress=True,
        model_kwargs=up_kwargs,
        cond_fn=None,
    )
    up_samples = up_out[:n_images]
    model_up.del_cache()

    # Encode the result for the JSON response.
    encoded = to_base64(get_images(up_samples))
    return {"image": encoded}
170
+
171
+
172
def to_base64(pil_image):
    """Encode an image as JPEG and return the base64 form of those bytes.

    Args:
        pil_image: Object exposing ``save(fp, format=...)``.

    Returns:
        ``bytes`` holding the base64-encoded JPEG data.
    """
    with BytesIO() as jpeg_buffer:
        pil_image.save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()
    return base64.b64encode(jpeg_bytes)