Spaces:

AmitIsraeli
/

PopYou

Sleeping

App Files Files Community

AmitIsraeli committed 28 days ago

Commit

f6d4208

•

1 Parent(s): 8d1279d

add explanation

Browse files

Files changed (3) hide show

.DS_Store +0 -0
VAR_explained.png +0 -0
app.py +102 -27

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

VAR_explained.png ADDED Viewed

app.py CHANGED Viewed

@@ -35,9 +35,9 @@ class SimpleAdapter(nn.Module):
         x = self.norm2(x)
         return x
-class InrenceTextVAR(nn.Module):
     def __init__(self, pl_checkpoint=None, start_class_id=578, hugging_face_token=None, siglip_model='google/siglip-base-patch16-224', device="cpu", MODEL_DEPTH=16):
-        super(InrenceTextVAR, self).__init__()
         self.device = device
         self.class_id = start_class_id
         # Define layers
@@ -117,12 +117,10 @@ if __name__ == '__main__':
     # Initialize the model
     checkpoint = 'VARtext_v1.pth'  # Replace with your actual checkpoint path
     device = 'cpu' if not torch.cuda.is_available() else 'cuda'
-    state_dict = torch.load(checkpoint, map_location="cpu")
-    model = InrenceTextVAR(device=device)
-    model.load_state_dict(state_dict)
     model.to(device)
     def generate_image_gradio(text, beta=1.0, seed=None, more_smooth=False, top_k=0, top_p=0.9):
         print(f"Generating image for text: {text}\n"
               f"beta: {beta}\n"
@@ -133,34 +131,111 @@ if __name__ == '__main__':
         image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=int(top_k), top_p=top_p)
         return image
-    with gr.Blocks() as demo:
-        gr.Markdown("# PopYou2-VAR")
         with gr.Tab("Generate Image"):
-            text_input = gr.Textbox(label="Input Text")
-            beta_input = gr.Slider(label="Beta", minimum=0.0, maximum=2.5, step=0.05, value=1.0)
-            seed_input = gr.Number(label="Seed", value=None)
-            more_smooth_input = gr.Checkbox(label="More Smooth", value=False)
-            top_k_input = gr.Number(label="Top K", value=0)
-            top_p_input = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.9)
-            generate_button = gr.Button("Generate Image")
-            image_output = gr.Image(label="Generated Image")
             generate_button.click(
                 generate_image_gradio,
                 inputs=[text_input, beta_input, seed_input, more_smooth_input, top_k_input, top_p_input],
                 outputs=image_output
             )
-        gr.Markdown("### Examples")
-        with gr.Row():
-            example1_text = gr.Textbox(label="Example 1", value="a funko pop figure of a yellow robot tom cruise with headphones on a white background", interactive=False)
-            example1_image = gr.Image(label="Generated Image 1", value="examples/tom_cruise_robot.png")  # Replace with the actual path
-        with gr.Row():
-            example2_text = gr.Textbox(label="Example 2", value="a funko pop figure of a alien Scarlett Johansson holding a shield on a white background", interactive=False)
-            example2_image = gr.Image(label="Generated Image 2", value="examples/alien_Scarlett_Johansson.png")  # Replace with the actual path
-        with gr.Row():
-            example3_text = gr.Textbox(label="Example 3", value="a funko pop figure of a woman with a hat and a pink long hair and blue dress on a white background", interactive=False)
-            example3_image = gr.Image(label="Generated Image 3", value="examples/woman_pink.png")  # Replace with the actual path
-    demo.launch()

         x = self.norm2(x)
         return x
+class InferenceTextVAR(nn.Module):
     def __init__(self, pl_checkpoint=None, start_class_id=578, hugging_face_token=None, siglip_model='google/siglip-base-patch16-224', device="cpu", MODEL_DEPTH=16):
+        super(InferenceTextVAR, self).__init__()
         self.device = device
         self.class_id = start_class_id
         # Define layers
     # Initialize the model
     checkpoint = 'VARtext_v1.pth'  # Replace with your actual checkpoint path
     device = 'cpu' if not torch.cuda.is_available() else 'cuda'
+    model = InferenceTextVAR(device=device)
+    model.load_state_dict(torch.load(checkpoint, map_location=device))
     model.to(device)
     def generate_image_gradio(text, beta=1.0, seed=None, more_smooth=False, top_k=0, top_p=0.9):
         print(f"Generating image for text: {text}\n"
               f"beta: {beta}\n"
         image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=int(top_k), top_p=top_p)
         return image
+    with gr.Blocks(css="""
+    .project-item {margin-bottom: 30px;}
+    .project-tags .tag {display: inline-block; background-color: #e0e0e0; padding: 5px 10px; margin-right: 5px; border-radius: 5px;}
+    .project-description {margin-top: 20px;}
+    .github-button, .huggingface-button, .wandb-button {
+        display: inline-block; margin-left: 10px; text-decoration: none; font-size: 14px;
+        padding: 5px 10px; background-color: #f0f0f0; border-radius: 5px; color: black;
+    }
+    .project-content {display: flex; flex-direction: row;}
+    .project-description {flex: 2; padding-right: 20px;}
+    .project-options-image {flex: 1;}
+    .funko-image {width: 100%; max-width: 300px;}
+    """) as demo:
+        gr.Markdown("""
+        # PopYou2 - VAR Text
+        <!-- Project Links -->
+        [![GitHub](https://img.shields.io/badge/GitHub-Repository-blue?logo=github)](https://github.com/amit154154/VAR_clip)
+        [![Weights & Biases](https://img.shields.io/badge/Weights%20%26%20Biases-Report-orange?logo=weightsandbiases)](https://api.wandb.ai/links/amit154154/cqccmfsl)
+        **Tags:** Image Generation, GAN
+        ## Project Explanation
+        - **Dataset Generation:** Generated a comprehensive dataset of approximately 100,000 Funko Pop! images with detailed prompts using [SDXL Turbo](https://huggingface.co/stabilityai/sdxl-turbo) for high-quality data creation.
+        - **Model Fine-tuning:** Fine-tuned the [Visual AutoRegressive (VAR)](https://arxiv.org/abs/2404.02905) model, pretrained on ImageNet, to adapt it for Funko Pop! generation by injecting a custom embedding representing the "doll" class.
+        - **Adapter Training:** Trained an adapter with the frozen [SigLIP image encoder](https://github.com/FoundationVision/VAR) and a lightweight LoRA module to map image embeddings to text representation in a large language model.
+        - **Text-to-Image Generation:** Enabled text-to-image generation by replacing the SigLIP image encoder with its text encoder, retaining frozen components such as the VAE and generator for efficiency and quality.
+        ![VAR Explained](VAR_explained.png)
+        ## Generate Your Own Funko Pop!
+        """)
         with gr.Tab("Generate Image"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    text_input = gr.Textbox(label="Input Text", placeholder="Enter a description for your Funko Pop!")
+                    beta_input = gr.Slider(label="Beta", minimum=0.0, maximum=2.5, step=0.05, value=1.0)
+                    seed_input = gr.Number(label="Seed", value=None)
+                    more_smooth_input = gr.Checkbox(label="More Smooth", value=False)
+                    top_k_input = gr.Number(label="Top K", value=0)
+                    top_p_input = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.5)
+                    generate_button = gr.Button("Generate Image")
+                with gr.Column(scale=1):
+                    image_output = gr.Image(label="Generated Image")
             generate_button.click(
                 generate_image_gradio,
                 inputs=[text_input, beta_input, seed_input, more_smooth_input, top_k_input, top_p_input],
                 outputs=image_output
             )
+        gr.Markdown("## Examples")
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### Example 1")
+                gr.Markdown("A Funko Pop figure of a yellow robot Tom Cruise with headphones on a white background")
+                example1_image = gr.Image(value="examples/tom_cruise_robot.png")  # Replace with the actual path
+            with gr.Column():
+                gr.Markdown("### Example 2")
+                gr.Markdown("A Funko Pop figure of an alien Scarlett Johansson holding a shield on a white background")
+                example2_image = gr.Image(value="examples/alien_Scarlett_Johansson.png")  # Replace with the actual path
+            with gr.Column():
+                gr.Markdown("### Example 3")
+                gr.Markdown("A Funko Pop figure of a woman with a hat and pink long hair and blue dress on a white background")
+                example3_image = gr.Image(value="examples/woman_pink.png")  # Replace with the actual path
+        gr.Markdown("""
+        ## Customize Your Funko Pop!
+        Build your own Funko Pop! by selecting options below and clicking "Generate Custom Funko Pop!".
+        """)
+        def update_custom_image(famous_name, character, action):  # click handler for custom_generate_button; receives the three dropdown values
+            # Build the prompt from the selections; the string "None" is each dropdown's "no selection" choice
+            parts = []
+            if famous_name != "None":
+                parts.append(f"a Funko Pop figure of {famous_name}")
+            else:
+                parts.append("a Funko Pop figure")  # no name chosen: fall back to a generic figure
+            if character != "None":
+                parts.append(f"styled as a {character}")
+            if action != "None":
+                parts.append(f"performing {action}")
+            parts.append("on a white background")  # always appended, whatever was selected
+            prompt = ", ".join(parts)  # fragments end up comma-separated in the final prompt
+            image = model.generate_image(prompt)  # only the prompt is passed; presumably relies on generate_image's defaults for beta/seed/top_k/top_p - confirm
+            return image  # routed to custom_image_output by the click binding
+        famous_name_input = gr.Dropdown(choices=["None", "Donald Trump", "Johnny Depp", "Oprah Winfrey"], label="Famous Name", value="None")
+        character_input = gr.Dropdown(choices=["None", "Alien", "Robot"], label="Character", value="None")
+        action_input = gr.Dropdown(choices=["None", "Playing the Guitar", "Holding the Sword"], label="Action", value="None")
+        custom_generate_button = gr.Button("Generate Custom Funko Pop!")
+        custom_image_output = gr.Image(label="Custom Funko Pop!")
+        custom_generate_button.click(
+            update_custom_image,
+            inputs=[famous_name_input, character_input, action_input],
+            outputs=custom_image_output
+        )
+    demo.launch()