fffiloni committed on
Commit
d18abca
1 Parent(s): dc3228c

SD-XL max tokens optimization (with compel)

Browse files
Files changed (1) hide show
  1. app.py +18 -4
app.py CHANGED
@@ -8,11 +8,23 @@ from gradio_client import Client
8
 
9
  client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
10
 
 
11
  from diffusers import DiffusionPipeline
12
  import torch
13
 
14
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
 
 
 
15
  pipe.to("cuda")
 
 
 
 
 
 
 
 
16
  #pipe.enable_model_cpu_offload()
17
 
18
  # if using torch < 2.0
@@ -79,9 +91,7 @@ def infer(audio_file):
79
  I'll give you a music description, from i want you to provide an illustrative image description that would fit well with the music.
80
  Do not processs each segment or song, but provide a summary for the whole instead.
81
  Answer with only one image description. Never do lists. Maximum 77 tokens.
82
-
83
  Here's the music description :
84
-
85
  {cap_result}
86
 
87
  """
@@ -95,7 +105,11 @@ def infer(audio_file):
95
 
96
  print(f"Llama2 result: {result}")
97
 
98
- images = pipe(prompt=result).images[0]
 
 
 
 
99
 
100
  print("Finished")
101
 
 
8
 
9
  client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
10
 
11
+ from compel import Compel, ReturnedEmbeddingsType
12
  from diffusers import DiffusionPipeline
13
  import torch
14
 
15
+ pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
16
+ torch_dtype=torch.float16,
17
+ use_safetensors=True,
18
+ variant="fp16")
19
  pipe.to("cuda")
20
+
21
+ compel = Compel(
22
+ tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
23
+ text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
24
+ returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
25
+ requires_pooled=[False, True]
26
+ )
27
+
28
  #pipe.enable_model_cpu_offload()
29
 
30
  # if using torch < 2.0
 
91
  I'll give you a music description, from i want you to provide an illustrative image description that would fit well with the music.
92
  Do not processs each segment or song, but provide a summary for the whole instead.
93
  Answer with only one image description. Never do lists. Maximum 77 tokens.
 
94
  Here's the music description :
 
95
  {cap_result}
96
 
97
  """
 
105
 
106
  print(f"Llama2 result: {result}")
107
 
108
+ # ———
109
+
110
+ prompt = result
111
+ conditioning, pooled = compel(prompt)
112
+ images = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
113
 
114
  print("Finished")
115