AmitIsraeli committed on
Commit
f6d4208
1 Parent(s): 8d1279d

add explanation

Browse files
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. VAR_explained.png +0 -0
  3. app.py +102 -27
.DS_Store ADDED
Binary file (6.15 kB). View file
 
VAR_explained.png ADDED
app.py CHANGED
@@ -35,9 +35,9 @@ class SimpleAdapter(nn.Module):
35
  x = self.norm2(x)
36
  return x
37
 
38
- class InrenceTextVAR(nn.Module):
39
  def __init__(self, pl_checkpoint=None, start_class_id=578, hugging_face_token=None, siglip_model='google/siglip-base-patch16-224', device="cpu", MODEL_DEPTH=16):
40
- super(InrenceTextVAR, self).__init__()
41
  self.device = device
42
  self.class_id = start_class_id
43
  # Define layers
@@ -117,12 +117,10 @@ if __name__ == '__main__':
117
  # Initialize the model
118
  checkpoint = 'VARtext_v1.pth' # Replace with your actual checkpoint path
119
  device = 'cpu' if not torch.cuda.is_available() else 'cuda'
120
- state_dict = torch.load(checkpoint, map_location="cpu")
121
- model = InrenceTextVAR(device=device)
122
- model.load_state_dict(state_dict)
123
  model.to(device)
124
 
125
-
126
  def generate_image_gradio(text, beta=1.0, seed=None, more_smooth=False, top_k=0, top_p=0.9):
127
  print(f"Generating image for text: {text}\n"
128
  f"beta: {beta}\n"
@@ -133,34 +131,111 @@ if __name__ == '__main__':
133
  image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=int(top_k), top_p=top_p)
134
  return image
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- with gr.Blocks() as demo:
138
- gr.Markdown("# PopYou2-VAR")
139
  with gr.Tab("Generate Image"):
140
- text_input = gr.Textbox(label="Input Text")
141
- beta_input = gr.Slider(label="Beta", minimum=0.0, maximum=2.5, step=0.05, value=1.0)
142
- seed_input = gr.Number(label="Seed", value=None)
143
- more_smooth_input = gr.Checkbox(label="More Smooth", value=False)
144
- top_k_input = gr.Number(label="Top K", value=0)
145
- top_p_input = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.9)
146
- generate_button = gr.Button("Generate Image")
147
- image_output = gr.Image(label="Generated Image")
 
 
 
 
148
  generate_button.click(
149
  generate_image_gradio,
150
  inputs=[text_input, beta_input, seed_input, more_smooth_input, top_k_input, top_p_input],
151
  outputs=image_output
152
  )
153
 
154
- gr.Markdown("### Examples")
155
- with gr.Row():
156
- example1_text = gr.Textbox(label="Example 1", value="a funko pop figure of a yellow robot tom cruise with headphones on a white background", interactive=False)
157
- example1_image = gr.Image(label="Generated Image 1", value="examples/tom_cruise_robot.png") # Replace with the actual path
158
- with gr.Row():
159
- example2_text = gr.Textbox(label="Example 2", value="a funko pop figure of a alien Scarlett Johansson holding a shield on a white background", interactive=False)
160
- example2_image = gr.Image(label="Generated Image 2", value="examples/alien_Scarlett_Johansson.png") # Replace with the actual path
161
- with gr.Row():
162
- example3_text = gr.Textbox(label="Example 3", value="a funko pop figure of a woman with a hat and a pink long hair and blue dress on a white background", interactive=False)
163
- example3_image = gr.Image(label="Generated Image 3", value="examples/woman_pink.png") # Replace with the actual path
164
 
165
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
 
 
35
  x = self.norm2(x)
36
  return x
37
 
38
+ class InferenceTextVAR(nn.Module):
39
  def __init__(self, pl_checkpoint=None, start_class_id=578, hugging_face_token=None, siglip_model='google/siglip-base-patch16-224', device="cpu", MODEL_DEPTH=16):
40
+ super(InferenceTextVAR, self).__init__()
41
  self.device = device
42
  self.class_id = start_class_id
43
  # Define layers
 
117
  # Initialize the model
118
  checkpoint = 'VARtext_v1.pth' # Replace with your actual checkpoint path
119
  device = 'cpu' if not torch.cuda.is_available() else 'cuda'
120
+ model = InferenceTextVAR(device=device)
121
+ model.load_state_dict(torch.load(checkpoint, map_location=device))
 
122
  model.to(device)
123
 
 
124
  def generate_image_gradio(text, beta=1.0, seed=None, more_smooth=False, top_k=0, top_p=0.9):
125
  print(f"Generating image for text: {text}\n"
126
  f"beta: {beta}\n"
 
131
  image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=int(top_k), top_p=top_p)
132
  return image
133
 
134
+ with gr.Blocks(css="""
135
+ .project-item {margin-bottom: 30px;}
136
+ .project-tags .tag {display: inline-block; background-color: #e0e0e0; padding: 5px 10px; margin-right: 5px; border-radius: 5px;}
137
+ .project-description {margin-top: 20px;}
138
+ .github-button, .huggingface-button, .wandb-button {
139
+ display: inline-block; margin-left: 10px; text-decoration: none; font-size: 14px;
140
+ padding: 5px 10px; background-color: #f0f0f0; border-radius: 5px; color: black;
141
+ }
142
+ .project-content {display: flex; flex-direction: row;}
143
+ .project-description {flex: 2; padding-right: 20px;}
144
+ .project-options-image {flex: 1;}
145
+ .funko-image {width: 100%; max-width: 300px;}
146
+ """) as demo:
147
+ gr.Markdown("""
148
+ # PopYou2 - VAR Text
149
+
150
+ <!-- Project Links -->
151
+ [![GitHub](https://img.shields.io/badge/GitHub-Repository-blue?logo=github)](https://github.com/amit154154/VAR_clip)
152
+ [![Weights & Biases](https://img.shields.io/badge/Weights%20%26%20Biases-Report-orange?logo=weightsandbiases)](https://api.wandb.ai/links/amit154154/cqccmfsl)
153
+
154
+ **Tags:** Image Generation, GAN
155
+
156
+ ## Project Explanation
157
+
158
+ - **Dataset Generation:** Generated a comprehensive dataset of approximately 100,000 Funko Pop! images with detailed prompts using [SDXL Turbo](https://huggingface.co/stabilityai/sdxl-turbo) for high-quality data creation.
159
+ - **Model Fine-tuning:** Fine-tuned the [Visual AutoRegressive (VAR)](https://arxiv.org/abs/2404.02905) model, pretrained on ImageNet, to adapt it for Funko Pop! generation by injecting a custom embedding representing the "doll" class.
160
+ - **Adapter Training:** Trained an adapter with the frozen [SigLIP image encoder](https://github.com/FoundationVision/VAR) and a lightweight LoRA module to map image embeddings to text representation in a large language model.
161
+ - **Text-to-Image Generation:** Enabled text-to-image generation by replacing the SigLIP image encoder with its text encoder, retaining frozen components such as the VAE and generator for efficiency and quality.
162
+
163
+ ![VAR Explained](VAR_explained.png)
164
+
165
+
166
+ ## Generate Your Own Funko Pop!
167
+ """)
168
 
 
 
169
  with gr.Tab("Generate Image"):
170
+ with gr.Row():
171
+ with gr.Column(scale=1):
172
+ text_input = gr.Textbox(label="Input Text", placeholder="Enter a description for your Funko Pop!")
173
+ beta_input = gr.Slider(label="Beta", minimum=0.0, maximum=2.5, step=0.05, value=1.0)
174
+ seed_input = gr.Number(label="Seed", value=None)
175
+ more_smooth_input = gr.Checkbox(label="More Smooth", value=False)
176
+ top_k_input = gr.Number(label="Top K", value=0)
177
+ top_p_input = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.5)
178
+ generate_button = gr.Button("Generate Image")
179
+ with gr.Column(scale=1):
180
+ image_output = gr.Image(label="Generated Image")
181
+
182
  generate_button.click(
183
  generate_image_gradio,
184
  inputs=[text_input, beta_input, seed_input, more_smooth_input, top_k_input, top_p_input],
185
  outputs=image_output
186
  )
187
 
188
+ gr.Markdown("## Examples")
 
 
 
 
 
 
 
 
 
189
 
190
+ with gr.Row():
191
+ with gr.Column():
192
+ gr.Markdown("### Example 1")
193
+ gr.Markdown("A Funko Pop figure of a yellow robot Tom Cruise with headphones on a white background")
194
+ example1_image = gr.Image(value="examples/tom_cruise_robot.png") # Replace with the actual path
195
+
196
+ with gr.Column():
197
+ gr.Markdown("### Example 2")
198
+ gr.Markdown("A Funko Pop figure of an alien Scarlett Johansson holding a shield on a white background")
199
+ example2_image = gr.Image(value="examples/alien_Scarlett_Johansson.png") # Replace with the actual path
200
+
201
+ with gr.Column():
202
+ gr.Markdown("### Example 3")
203
+ gr.Markdown("A Funko Pop figure of a woman with a hat and pink long hair and blue dress on a white background")
204
+ example3_image = gr.Image(value="examples/woman_pink.png") # Replace with the actual path
205
+
206
+ gr.Markdown("""
207
+ ## Customize Your Funko Pop!
208
+
209
+ Build your own Funko Pop! by selecting options below and clicking "Generate Custom Funko Pop!".
210
+
211
+ """)
212
+
213
+ def update_custom_image(famous_name, character, action):
214
+ # Build the prompt based on the selections
215
+ parts = []
216
+ if famous_name != "None":
217
+ parts.append(f"a Funko Pop figure of {famous_name}")
218
+ else:
219
+ parts.append("a Funko Pop figure")
220
+ if character != "None":
221
+ parts.append(f"styled as a {character}")
222
+ if action != "None":
223
+ parts.append(f"performing {action}")
224
+ parts.append("on a white background")
225
+ prompt = ", ".join(parts)
226
+ image = model.generate_image(prompt)
227
+ return image
228
+
229
+ famous_name_input = gr.Dropdown(choices=["None", "Donald Trump", "Johnny Depp", "Oprah Winfrey"], label="Famous Name", value="None")
230
+ character_input = gr.Dropdown(choices=["None", "Alien", "Robot"], label="Character", value="None")
231
+ action_input = gr.Dropdown(choices=["None", "Playing the Guitar", "Holding the Sword"], label="Action", value="None")
232
+ custom_generate_button = gr.Button("Generate Custom Funko Pop!")
233
+ custom_image_output = gr.Image(label="Custom Funko Pop!")
234
+
235
+ custom_generate_button.click(
236
+ update_custom_image,
237
+ inputs=[famous_name_input, character_input, action_input],
238
+ outputs=custom_image_output
239
+ )
240
 
241
+ demo.launch()