sohojoe committed on
Commit
dcd6afb
1 Parent(s): 489d13e

switch to use clip-retrieval's clip implementation

app.py ADDED
@@ -0,0 +1,482 @@
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from torchvision import transforms
+ # from diffusers import StableDiffusionPipeline, StableDiffusionImageVariationPipeline, DiffusionPipeline
+ import numpy as np
+ import pandas as pd
+ import math
+ from transformers import CLIPTextModel, CLIPTokenizer
+ import os
+
+ from clip_retrieval.clip_client import ClipClient, Modality
+
+
+ # clip_model_id = "openai/clip-vit-large-patch14-336"
+ # clip_retrieval_indice_name, clip_model_id = "laion5B-L-14", "/laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
+ clip_retrieval_service_url = "https://knn.laion.ai/knn-service"
+ # available models = ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
+ # clip_model = "ViT-B/32"
+ clip_model = "ViT-L/14"
+ clip_model_id = "laion5B-L-14"
+
+
+
+ max_tabs = 10
+ input_images = [None for i in range(max_tabs)]
+ input_prompts = [None for i in range(max_tabs)]
+ embedding_plots = [None for i in range(max_tabs)]
+ embedding_powers = [1. for i in range(max_tabs)]
+ # global embedding_base64s
+ embedding_base64s = [None for i in range(max_tabs)]
+ # embedding_base64s = gr.State(value=[None for i in range(max_tabs)])
+
+
+ def image_to_embedding(input_im):
+     # gradio hands the image over as a numpy array; convert it to PIL so it can go through the
+     # `preprocess` transform returned by clip_retrieval's load_clip (this assumes the usual
+     # CLIP / open_clip style API: preprocess(pil_image) -> tensor, model.encode_image(batch))
+     if isinstance(input_im, np.ndarray):
+         input_im = Image.fromarray(input_im)
+     input = preprocess(input_im).unsqueeze(0).to(device)
+
+     with torch.no_grad():
+         image_embeddings = model.encode_image(input)
+
+     # image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+     image_embeddings_np = image_embeddings.cpu().detach().numpy()
+     return image_embeddings_np
+
+ def prompt_to_embedding(prompt):
+     # tokenize with clip_retrieval's tokenizer instead of the transformers processor
+     # (assumed to behave like clip.tokenize / open_clip's tokenizer: list of strings in,
+     # a (batch, 77) tensor of token ids out)
+     prompt_tokens = tokenizer([prompt])
+     with torch.no_grad():
+         prompt_embeddings = model.encode_text(prompt_tokens.to(device))
+     # prompt_embeddings /= prompt_embeddings.norm(dim=-1, keepdim=True)
+     prompt_embeddings = prompt_embeddings[0].cpu().detach().numpy()
+     return prompt_embeddings
+
+ def embedding_to_image(embeddings):
+     size = math.ceil(math.sqrt(embeddings.shape[0]))
+     image_embeddings_square = np.pad(embeddings, (0, size**2 - embeddings.shape[0]), 'constant')
+     image_embeddings_square.resize(size, size)
+     embedding_image = Image.fromarray(image_embeddings_square, mode="L")
+     return embedding_image
+
+ def embedding_to_base64(embeddings):
+     import base64
+     # ensure float16
+     embeddings = embeddings.astype(np.float16)
+     embeddings_b64 = base64.urlsafe_b64encode(embeddings).decode()
+     return embeddings_b64
+
+ def base64_to_embedding(embeddings_b64):
+     import base64
+     embeddings = base64.urlsafe_b64decode(embeddings_b64)
+     embeddings = np.frombuffer(embeddings, dtype=np.float16)
+     # embeddings = torch.tensor(embeddings)
+     return embeddings
+
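+ # The UI passes embeddings between components as base64 text so they stay copy/paste friendly.
+ # A rough sketch of the round trip (the 768-dim size is illustrative; float16 keeps the string
+ # short, so values come back truncated rather than exact):
+ #   e = np.random.rand(768).astype(np.float32)
+ #   b64 = embedding_to_base64(e)
+ #   np.allclose(base64_to_embedding(b64), e, atol=1e-3)  # True within float16 precision
+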
+ def main(
+     # input_im,
+     embeddings,
+     scale=3.0,
+     n_samples=4,
+     steps=25,
+     seed=None
+     ):
+
+     if seed is None:
+         seed = np.random.randint(2147483647)
+     # if device contains cuda (device is a string such as "cuda:0" or "cpu")
+     if device.startswith('cuda'):
+         generator = torch.Generator(device=device).manual_seed(int(seed))
+     else:
+         generator = torch.Generator().manual_seed(int(seed))  # use cpu as it does not work on mps
+
+     embeddings = base64_to_embedding(embeddings)
+     embeddings = torch.tensor(embeddings, dtype=torch_dtype).to(device)
+
+     # NOTE: `pipe` is not created in this revision (the diffusers pipeline setup below is
+     # commented out), so image generation will fail until it is restored.
+     images_list = pipe(
+         # inp.tile(n_samples, 1, 1, 1),
+         # [embeddings * n_samples],
+         embeddings,
+         guidance_scale=scale,
+         num_inference_steps=steps,
+         generator=generator,
+         )
+
+     images = []
+     for i, image in enumerate(images_list["images"]):
+         images.append(image)
+         # images.append(embedding_image)
+     return images
+
+ def on_image_load_update_embeddings(image_data):
+     # image to embeddings
+     if image_data is None:
+         # embeddings = prompt_to_embedding('')
+         # embeddings_b64 = embedding_to_base64(embeddings)
+         # return gr.Text.update(embeddings_b64)
+         return gr.Text.update('')
+     embeddings = image_to_embedding(image_data)
+     embeddings_b64 = embedding_to_base64(embeddings)
+     return gr.Text.update(embeddings_b64)
+
+ def on_prompt_change_update_embeddings(prompt):
+     # prompt to embeddings
+     if prompt is None or prompt == "":
+         embeddings = prompt_to_embedding('')
+         embeddings_b64 = embedding_to_base64(embeddings)
+         return gr.Text.update(embeddings_b64)
+     embeddings = prompt_to_embedding(prompt)
+     embeddings_b64 = embedding_to_base64(embeddings)
+     return gr.Text.update(embeddings_b64)
+
+ def update_average_embeddings(embedding_base64s_state, embedding_powers):
+     final_embedding = None
+     num_embeddings = 0
+     for i, embedding_base64 in enumerate(embedding_base64s_state):
+         if embedding_base64 is None or embedding_base64 == "":
+             continue
+         embedding = base64_to_embedding(embedding_base64)
+         embedding = embedding * embedding_powers[i]
+         if final_embedding is None:
+             final_embedding = embedding
+         else:
+             final_embedding = final_embedding + embedding
+         num_embeddings += 1
+     if final_embedding is None:
+         # embeddings = prompt_to_embedding('')
+         # embeddings_b64 = embedding_to_base64(embeddings)
+         # return gr.Text.update(embeddings_b64)
+         return gr.Text.update('')
+
+     # TODO toggle this to support average or sum
+     final_embedding = final_embedding / num_embeddings
+
+     embeddings_b64 = embedding_to_base64(final_embedding)
+     return embeddings_b64
+
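+ # Note on update_average_embeddings: the result is the power-weighted sum divided by the number
+ # of active inputs, i.e. final = (sum_i power_i * e_i) / N, rather than a weighted mean over
+ # sum_i power_i (see the TODO above for switching between average and sum behaviour).
+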
+ def on_power_change_update_average_embeddings(embedding_base64s_state, embedding_power_state, power, idx):
+     embedding_power_state[idx] = power
+     embeddings_b64 = update_average_embeddings(embedding_base64s_state, embedding_power_state)
+     return gr.Text.update(embeddings_b64)
+
+ def on_embeddings_changed_update_average_embeddings(embedding_base64s_state, embedding_power_state, embedding_base64, idx):
+     embedding_base64s_state[idx] = embedding_base64 if embedding_base64 != '' else None
+     embeddings_b64 = update_average_embeddings(embedding_base64s_state, embedding_power_state)
+     return gr.Text.update(embeddings_b64)
+
+ def on_embeddings_changed_update_plot(embeddings_b64):
+     # plot new embeddings
+     if embeddings_b64 is None or embeddings_b64 == "":
+         data = pd.DataFrame({
+             'embedding': [],
+             'index': []})
+         return gr.LinePlot.update(data,
+             x="index",
+             y="embedding",
+             # color="country",
+             title="Embeddings",
+             # stroke_dash="cluster",
+             # x_lim=[1950, 2010],
+             tooltip=['index', 'embedding'],
+             # stroke_dash_legend_title="Country Cluster",
+             # height=300,
+             width=0)
+
+     embeddings = base64_to_embedding(embeddings_b64)
+     data = pd.DataFrame({
+         'embedding': embeddings,
+         'index': [n for n in range(len(embeddings))]})
+     return gr.LinePlot.update(data,
+         x="index",
+         y="embedding",
+         # color="country",
+         title="Embeddings",
+         # stroke_dash="cluster",
+         # x_lim=[1950, 2010],
+         tooltip=['index', 'embedding'],
+         # stroke_dash_legend_title="Country Cluster",
+         # height=300,
+         width=embeddings.shape[0])
+
+ def on_example_image_click_set_image(input_image, image_url):
+     input_image.value = image_url
+
+ # device = torch.device("mps" if torch.backends.mps.is_available() else "cuda:0" if torch.cuda.is_available() else "cpu")
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if device.startswith('cuda') else torch.float32
+ # torch_dtype = torch.float32
+ # pipe = StableDiffusionPipeline.from_pretrained(
+ #     model_id,
+ #     custom_pipeline="pipeline.py",
+ #     torch_dtype=torch_dtype,
+ #     # , revision="fp16",
+ #     requires_safety_checker = False, safety_checker=None,
+ #     text_encoder = CLIPTextModel,
+ #     tokenizer = CLIPTokenizer,
+ #     )
+ # pipe = pipe.to(device)
+
+ from transformers import AutoProcessor, AutoModel
+ # processor = AutoProcessor.from_pretrained(clip_model_id)
+ # model = AutoModel.from_pretrained(clip_model_id)
+ # model = model.to(device)
+
+ from clip_retrieval.load_clip import load_clip, get_tokenizer
+ # model, preprocess = load_clip(clip_model, use_jit=True, device=device)
+ model, preprocess = load_clip(clip_model, use_jit=True, device=device)
+ tokenizer = get_tokenizer(clip_model)
+
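+ # `model`, `preprocess` and `tokenizer` above come from clip_retrieval's load_clip helper, which
+ # wraps the OpenAI CLIP / open_clip models. The sketch below shows the API assumed by
+ # image_to_embedding and prompt_to_embedding (encode_image / encode_text); shapes are for ViT-L/14:
+ #   tokens = tokenizer(["an image of a cat"])        # (1, 77) token ids
+ #   text_emb = model.encode_text(tokens.to(device))  # (1, 768)
+ #   image_emb = model.encode_image(preprocess(Image.open("images/pup1.jpg")).unsqueeze(0).to(device))  # (1, 768)
+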
+ test_url = "https://placekitten.com/400/600"
+ test_caption = "an image of a cat"
+ test_image_1 = "tests/test_clip_inference/test_images/123_456.jpg"
+ test_image_2 = "tests/test_clip_inference/test_images/416_264.jpg"
+
+ # clip_retrieval_service_url = "https://knn.laion.ai/knn-service"
+ clip_retrieval_client = ClipClient(
+     url=clip_retrieval_service_url,
+     indice_name=clip_model_id,
+     use_safety_model=False,
+     use_violence_detector=False,
+     )
+ # results = clip_retrieval_client.query(text="an image of a cat")
+ # results[0]
+
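+ # A rough sketch of how the knn client could be used to look up laion5B neighbours for an
+ # embedding (query() also accepts text=, image= and image_url=; embedding_input is assumed to
+ # take a plain list of floats):
+ #   embedding = prompt_to_embedding("an image of a cat")
+ #   results = clip_retrieval_client.query(embedding_input=embedding.astype(np.float32).tolist())
+ #   # each result is a dict with keys such as "url", "caption" and "similarity"
+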
+ examples = [
+     ["SohoJoeEth.jpeg", "Ray-Liotta-Goodfellas.jpg", "SohoJoeEth + Ray.jpeg"],
+     # ["SohoJoeEth.jpeg", "Donkey.jpg", "SohoJoeEth + Donkey.jpeg"],
+     # ["SohoJoeEth.jpeg", "Snoop Dogg.jpg", "SohoJoeEth + Snoop Dogg.jpeg"],
+ ]
+ tile_size = 100
+ # image_folder = os.path.join("file", "images")
+ image_folder = "images"
+
+ # image_examples = {
+ #     "452650": "452650.jpeg",
+ #     "Prompt 1": "a college dorm with a desk and bunk beds",
+ #     "371739": "371739.jpeg",
+ #     "Prompt 2": "a large banana is placed before a stuffed monkey.",
+ #     "557922": "557922.jpeg",
+ #     "Prompt 3": "a person sitting on a bench using a cell phone",
+
+ # }
+
+ tabbed_examples = {
+     "CoCo": {
+         "452650": "452650.jpeg",
+         "Prompt 1": "a college dorm with a desk and bunk beds",
+         "371739": "371739.jpeg",
+         "Prompt 2": "a large banana is placed before a stuffed monkey.",
+         "557922": "557922.jpeg",
+         "Prompt 3": "a person sitting on a bench using a cell phone",
+         "540554": "540554.jpeg",
+         "Prompt 4": "two trains are coming down the tracks, a steam engine and a modern train.",
+     },
+     "Transforms": {
+         "ColorWheel001": "ColorWheel001.jpg",
+         "ColorWheel001 BW": "ColorWheel001 BW.jpg",
+         "ColorWheel002": "ColorWheel002.jpg",
+         "ColorWheel002 BW": "ColorWheel002 BW.jpg",
+     },
+     "Portraits": {
+         "Snoop": "Snoop Dogg.jpg",
+         "Snoop Prompt": "Snoop Dogg",
+         "Ray": "Ray-Liotta-Goodfellas.jpg",
+         "Ray Prompt": "Ray Liotta, Goodfellas",
+         "Anya": "Anya Taylor-Joy 003.jpg",
+         "Anya Prompt": "Anya Taylor-Joy, The Queen's Gambit",
+         "Billie": "billie eilish 004.jpeg",
+         "Billie Prompt": "Billie Eilish, blonde hair",
+         "Lizzo": "Lizzo 001.jpeg",
+         "Lizzo Prompt": "Lizzo,",
+         "Donkey": "Donkey.jpg",
+         "Donkey Prompt": "Donkey, from Shrek",
+     },
+     "NFT's": {
+         "SohoJoe": "SohoJoeEth.jpeg",
+         "SohoJoe Prompt": "SohoJoe.Eth",
+         "Mirai": "Mirai.jpg",
+         "Mirai Prompt": "Mirai from White Rabbit, @shibuyaxyz",
+         "OnChainMonkey": "OnChainMonkey-2278.jpg",
+         "OCM Prompt": "On Chain Monkey",
+         "Wassie": "Wassie 4498.jpeg",
+         "Wassie Prompt": "Wassie by Wassies",
+     },
+     "Pups": {
+         "Pup1": "pup1.jpg",
+         "Prompt": "Teacup Yorkies",
+         "Pup2": "pup2.jpg",
+         "Pup3": "pup3.jpg",
+         "Pup4": "pup4.jpeg",
+         "Pup5": "pup5.jpg",
+     },
+ }
+
+
+ image_examples_tile_size = 50
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column(scale=5):
+             gr.Markdown(
+ """
+ # Soho-Clip
+
+ A tool for exploring CLIP embedding spaces.
+
+ Try uploading a few images and/or adding some text prompts, then click generate images.
+ """)
+         with gr.Column(scale=2, min_width=(tile_size+20)*3):
+             with gr.Row():
+                 with gr.Column(scale=1, min_width=tile_size):
+                     gr.Markdown("## Input 1")
+                 with gr.Column(scale=1, min_width=tile_size):
+                     gr.Markdown("## Input 2")
+                 with gr.Column(scale=1, min_width=tile_size):
+                     gr.Markdown("## Generates:")
+             for example in examples:
+                 with gr.Row():
+                     for example_image in example:
+                         with gr.Column(scale=1, min_width=tile_size):
+                             local_path = os.path.join(image_folder, example_image)
+                             gr.Image(
+                                 value=local_path, shape=(tile_size, tile_size),
+                                 show_label=False, interactive=False) \
+                                 .style(height=tile_size, width=tile_size)
+
+     with gr.Row():
+         for i in range(max_tabs):
+             with gr.Tab(f"Input {i+1}"):
+                 with gr.Row():
+                     with gr.Column(scale=1, min_width=240):
+                         input_images[i] = gr.Image(label="Image Prompt", show_label=True)
+                     with gr.Column(scale=3, min_width=600):
+                         embedding_plots[i] = gr.LinePlot(show_label=False).style(container=False)
+                         # input_image.change(on_image_load, inputs=[input_image, plot])
+                 with gr.Row():
+                     with gr.Column(scale=2, min_width=240):
+                         input_prompts[i] = gr.Textbox(label="Text Prompt", show_label=True)
+                     with gr.Column(scale=3, min_width=600):
+                         with gr.Row():
+                             # with gr.Slider(min=-5, max=5, value=1, label="Power", show_label=True):
+                             #     embedding_powers[i] = gr.Slider.value
+                             embedding_powers[i] = gr.Slider(minimum=-3, maximum=3, value=1, label="Power", show_label=True, interactive=True)
+                         with gr.Row():
+                             with gr.Accordion("Embeddings (base64)", open=False):
+                                 embedding_base64s[i] = gr.Textbox(show_label=False)
+                 for idx, (tab_title, tab_examples) in enumerate(tabbed_examples.items()):
+                     with gr.Tab(tab_title):
+                         with gr.Row():
+                             for idx, (title, example) in enumerate(tab_examples.items()):
+                                 if example.endswith(".jpg") or example.endswith(".jpeg"):
+                                     # add image example
+                                     local_path = os.path.join(image_folder, example)
+                                     with gr.Column(scale=1, min_width=image_examples_tile_size):
+                                         gr.Examples(
+                                             examples=[local_path],
+                                             inputs=input_images[i],
+                                             label=title,
+                                         )
+                                 else:
+                                     # add text example
+                                     with gr.Column(scale=1, min_width=image_examples_tile_size*2):
+                                         gr.Examples(
+                                             examples=[example],
+                                             inputs=input_prompts[i],
+                                             label=title,
+                                         )
+
+     with gr.Row():
+         average_embedding_plot = gr.LinePlot(show_label=True, label="Average Embeddings (base64)").style(container=False)
+     with gr.Row():
+         with gr.Accordion("Average embeddings in base64", open=False):
+             average_embedding_base64 = gr.Textbox(show_label=False)
+     with gr.Row():
+         submit = gr.Button("Generate images")
+     with gr.Row():
+         with gr.Column(scale=1, min_width=200):
+             scale = gr.Slider(0, 25, value=3, step=1, label="Guidance scale")
+         with gr.Column(scale=1, min_width=200):
+             n_samples = gr.Slider(1, 4, value=1, step=1, label="Number images")
+         with gr.Column(scale=1, min_width=200):
+             steps = gr.Slider(5, 50, value=25, step=5, label="Steps")
+         with gr.Column(scale=1, min_width=200):
+             seed = gr.Number(None, label="Seed (blank = random)", precision=0)
+     with gr.Row():
+         output = gr.Gallery(label="Generated variations")
+
+     embedding_base64s_state = gr.State(value=[None for i in range(max_tabs)])
+     embedding_power_state = gr.State(value=[1. for i in range(max_tabs)])
+     for i in range(max_tabs):
+         input_images[i].change(on_image_load_update_embeddings, input_images[i], [embedding_base64s[i]])
+         input_prompts[i].change(on_prompt_change_update_embeddings, input_prompts[i], [embedding_base64s[i]])
+         embedding_base64s[i].change(on_embeddings_changed_update_plot, embedding_base64s[i], [embedding_plots[i]])
+         idx_state = gr.State(value=i)
+         embedding_base64s[i].change(on_embeddings_changed_update_average_embeddings, [embedding_base64s_state, embedding_power_state, embedding_base64s[i], idx_state], average_embedding_base64)
+         embedding_powers[i].change(on_power_change_update_average_embeddings, [embedding_base64s_state, embedding_power_state, embedding_powers[i], idx_state], average_embedding_base64)
+
+     average_embedding_base64.change(on_embeddings_changed_update_plot, average_embedding_base64, average_embedding_plot)
+
+     # submit.click(main, inputs=[embedding_base64s[0], scale, n_samples, steps, seed], outputs=output)
+     submit.click(main, inputs=[average_embedding_base64, scale, n_samples, steps, seed], outputs=output)
+     output.style(grid=2)
+
+     with gr.Row():
+         gr.Markdown(
+ """
+ My interest is in using CLIP for image/video understanding (see [CLIP_visual-spatial-reasoning](https://github.com/Sohojoe/CLIP_visual-spatial-reasoning)).
+
+
+ ### Initial Features
+
+ - Combine up to 10 images and/or text inputs to create an average embedding space.
+ - View embedding spaces as a graph
+ - Generate a new image based on the average embedding space
+
+ ### Known limitations
+
+ - Text input is a little off (it requires fine-tuning and I'm having issues with that at the moment)
+ - It can only generate a single image at a time
+ - Not easy to use the sample images
+
+ ### Acknowledgements
+
+ - I build heavily on Justin Pinkney's [Experiments in Image Variation](https://www.justinpinkney.com/image-variation-experiments). Please credit them if you use this work.
+ - [CLIP](https://openai.com/blog/clip/)
+ - [Stable Diffusion](https://github.com/CompVis/stable-diffusion)
+
+ """)
+
+ # ![Alt Text](file/pup1.jpg)
+
+ # <img src="file/pup1.jpg" width="100" height="100">
+
+ # ![Alt Text](file/pup1.jpg){height=100 width=100}
+
+ if __name__ == "__main__":
+     demo.launch()
images/371739.jpeg ADDED
images/452650.jpeg ADDED
images/540554.jpeg ADDED
images/557922.jpeg ADDED
images/Anya Taylor-Joy 003.jpg ADDED
images/ColorWheel001 BW.jpg ADDED
images/ColorWheel001.jpg ADDED
images/ColorWheel002 BW.jpg ADDED
images/ColorWheel002.jpg ADDED
images/Donkey.jpg ADDED
images/Lizzo 001.jpeg ADDED
images/Mirai.jpg ADDED
images/OnChainMonkey #2278.jpeg ADDED
images/OnChainMonkey-2278.jpg ADDED
images/Ray-Liotta-Goodfellas.jpg ADDED
images/Snoop Dogg.jpg ADDED
images/SohoJoeEth + Donkey.jpeg ADDED
images/SohoJoeEth + Ray.jpeg ADDED
images/SohoJoeEth + Snoop Dogg.jpeg ADDED
images/SohoJoeEth.jpeg ADDED
images/Wassie 4498.jpeg ADDED
images/billie eilish 004.jpeg ADDED
images/pup1.jpg ADDED
images/pup2.jpg ADDED
images/pup3.jpg ADDED
images/pup4.jpeg ADDED
images/pup5.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ --extra-index-url https://download.pytorch.org/whl/cu113
+ torch
+ torchvision
+ numpy
+ transformers
+ # diffusers
+ # ftfy
+ gradio
+ accelerate
+ clip-retrieval