mpatel57 committed
Commit a5eed04
1 Parent(s): 6d0ad4a

Update app.py

Files changed (1)
  1. app.py +120 -99
app.py CHANGED
@@ -15,7 +15,7 @@ from transformers import (
     CLIPTextModelWithProjection,
     CLIPVisionModelWithProjection,
     CLIPImageProcessor,
-    CLIPTokenizer
+    CLIPTokenizer,
 )
 
 from transformers import CLIPTokenizer
@@ -33,10 +33,11 @@ if torch.cuda.is_available():
     __device__ = "cuda"
     __dtype__ = torch.float16
 
+
 class Model:
     def __init__(self):
         self.device = __device__
-
+
         self.text_encoder = (
             CLIPTextModelWithProjection.from_pretrained(
                 "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
@@ -65,102 +66,48 @@ class Model:
         self.pipe = DiffusionPipeline.from_pretrained(
             "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=__dtype__
         ).to(self.device)
-
-    def inference(self, raw_data):
+
+    def inference(self, raw_data, seed):
+        generator = torch.Generator(device="cuda").manual_seed(seed)
         image_emb, negative_image_emb = self.pipe_prior(
             raw_data=raw_data,
+            generator=generator,
         ).to_tuple()
         image = self.pipe(
             image_embeds=image_emb,
             negative_image_embeds=negative_image_emb,
             num_inference_steps=50,
-            guidance_scale=4.0,
+            guidance_scale=7.5,
+            generator=generator,
         ).images[0]
         return image
-
-    def process_data(self,
-        image: PIL.Image.Image,
-        keyword: str,
-        image2: PIL.Image.Image,
-        keyword2: str,
-        text: str,
-    ) -> dict[str, Any]:
-        print(f"keyword : {keyword}, keyword2 : {keyword2}, prompt : {text}")
-        device = torch.device(self.device)
-        data: dict[str, Any] = {}
-        data['text'] = text
-
-        txt = self.tokenizer(
-            text,
-            padding='max_length',
-            truncation=True,
-            return_tensors='pt',
-        )
-        txt_items = {k: v.to(device) for k, v in txt.items()}
-        new_feats = self.text_encoder(**txt_items)
-        new_last_hidden_states = new_feats.last_hidden_state[0].cpu().numpy()
-
-        plt.imshow(image)
-        plt.title('image')
-        plt.savefig('image_testt2.png')
-        plt.show()
-
-        mask_img = self.image_processor(image, return_tensors="pt").to(__device__)
-        vision_feats = self.vision_encoder(
-            **mask_img
-        ).image_embeds
-
-        entity_tokens = self.tokenizer(keyword)["input_ids"][1:-1]
-        for tid in entity_tokens:
-            indices = np.where(txt_items["input_ids"][0].cpu().numpy() == tid)[0]
-            new_last_hidden_states[indices] = vision_feats[0].cpu().numpy()
-            print(indices)
-
-        if image2 is not None:
-            mask_img2 = self.image_processor(image2, return_tensors="pt").to(__device__)
-            vision_feats2 = self.vision_encoder(
-                **mask_img2
-            ).image_embeds
-            if keyword2 is not None:
-                entity_tokens = self.tokenizer(keyword2)["input_ids"][1:-1]
-                for tid in entity_tokens:
-                    indices = np.where(txt_items["input_ids"][0].cpu().numpy() == tid)[0]
-                    new_last_hidden_states[indices] = vision_feats2[0].cpu().numpy()
-                    print(indices)
-
-        text_feats = {
-            "prompt_embeds": new_feats.text_embeds.to(__device__),
-            "text_encoder_hidden_states": torch.tensor(new_last_hidden_states).unsqueeze(0).to(__device__),
-            "text_mask": txt_items["attention_mask"].to(__device__),
-        }
-        return text_feats
-
-    def run(self,
-        image: dict[str, PIL.Image.Image],
-        keyword: str,
-        image2: dict[str, PIL.Image.Image],
-        keyword2: str,
-        text: str,
-    ):
-
-        # aug_feats = self.process_data(image["composite"], keyword, image2["composite"], keyword2, text)
+
+    def run(
+        self,
+        image: dict[str, PIL.Image.Image],
+        keyword: str,
+        image2: dict[str, PIL.Image.Image],
+        keyword2: str,
+        text: str,
+        seed: int,
+    ):
         sub_imgs = [image["composite"]]
-        if image2:
-            sub_imgs.append(image2["composite"])
         sun_keywords = [keyword]
-        if keyword2:
+        if keyword2 and keyword2 != "no subject":
             sun_keywords.append(keyword2)
+        if image2:
+            sub_imgs.append(image2["composite"])
         raw_data = {
             "prompt": text,
            "subject_images": sub_imgs,
-            "subject_keywords": sun_keywords
+            "subject_keywords": sun_keywords,
         }
-        image = self.inference(raw_data)
+        image = self.inference(raw_data, seed)
         return image
 
-def create_demo():
 
-    USAGE = '''## To run the demo, you should:
+def create_demo():
+    USAGE = """## To run the demo, you should:
     1. Upload your image.
     2. <span style='color: red;'>**Upload a masked subject image with white blankspace or whiten out manually using brush tool.**
     3. Input a Keyword i.e. 'Dog'
@@ -169,7 +116,7 @@ def create_demo():
     4-2. Input the Keyword i.e. 'Sunglasses'
     3. Input proper text prompts, such as "A photo of Dog" or "A Dog wearing sunglasses", Please use the same keyword in the prompt.
     4. Click the Run button.
-    '''
+    """
 
     model = Model()
 
@@ -180,6 +127,8 @@ def create_demo():
 
     <p style="text-align: center; color: red;">This demo is currently hosted on either a small GPU or CPU. We will soon provide high-end GPU support.</p>
    <p style="text-align: center; color: red;">Please follow the instructions from here to run it locally: <a href="https://github.com/eclipse-t2i/lambda-eclipse-inference">GitHub Inference Code</a></p>
+
+    <a href="https://colab.research.google.com/drive/1VcqzXZmilntec3AsIyzCqlstEhX4Pa1o?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
     """
     )
     gr.Markdown(USAGE)
@@ -187,28 +136,41 @@ def create_demo():
         with gr.Column():
             with gr.Group():
                 gr.Markdown(
-                    'Upload your first masked subject image or mask out marginal space')
-                image = gr.ImageEditor(label='Input', type='pil', brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"))
+                    "Upload your first masked subject image or mask out marginal space"
+                )
+                image = gr.ImageEditor(
+                    label="Input",
+                    type="pil",
+                    brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"),
+                )
                 keyword = gr.Text(
-                    label='Keyword',
+                    label="Keyword",
                     placeholder='e.g. "Dog", "Goofie"',
-                    info='Keyword for first subject')
+                    info="Keyword for first subject",
+                )
                 gr.Markdown(
-                    'For Multi-Subject generation : Upload your second masked subject image or mask out marginal space')
-                image2 = gr.ImageEditor(label='Input', type='pil', brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"))
-                keyword2= gr.Text(
-                    label='Keyword',
+                    "For Multi-Subject generation : Upload your second masked subject image or mask out marginal space"
+                )
+                image2 = gr.ImageEditor(
+                    label="Input",
+                    type="pil",
+                    brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"),
+                )
+                keyword2 = gr.Text(
+                    label="Keyword",
                     placeholder='e.g. "Sunglasses", "Grand Canyon"',
-                    info='Keyword for second subject')
+                    info="Keyword for second subject",
+                )
                 prompt = gr.Text(
-                    label='Prompt',
+                    label="Prompt",
                     placeholder='e.g. "A photo of dog", "A dog wearing sunglasses"',
-                    info='Keep the keywords used previously in the prompt')
+                    info="Keep the keywords used previously in the prompt",
+                )
 
-                run_button = gr.Button('Run')
+                run_button = gr.Button("Run")
 
         with gr.Column():
-            result = gr.Image(label='Result')
+            result = gr.Image(label="Result")
 
         inputs = [
             image,
@@ -217,18 +179,77 @@ def create_demo():
             keyword2,
             prompt,
         ]
-
+
         gr.Examples(
-            examples=[[os.path.join(os.path.dirname(__file__), "./assets/cat.png"), "cat", os.path.join(os.path.dirname(__file__), "./assets/blue_sunglasses.png"), "glasses", "A cat wearing glasses on a snowy field"]],
-            inputs = inputs,
+            examples=[
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/luffy.jpg"),
+                    "luffy",
+                    os.path.join(os.path.dirname(__file__), "./assets/white.jpg"),
+                    "no subject",
+                    "luffy holding a sword",
+                ],
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/luffy.jpg"),
+                    "luffy",
+                    os.path.join(os.path.dirname(__file__), "./assets/white.jpg"),
+                    "no subject",
+                    "luffy in the living room",
+                ],
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/teapot.jpg"),
+                    "teapot",
+                    os.path.join(os.path.dirname(__file__), "./assets/white.jpg"),
+                    "no subject",
+                    "teapot on a cobblestone street",
+                ],
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/trex.jpg"),
+                    "trex",
+                    os.path.join(os.path.dirname(__file__), "./assets/white.jpg"),
+                    "no subject",
+                    "trex near a river",
+                ],
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/cat.png"),
+                    "cat",
+                    os.path.join(
+                        os.path.dirname(__file__), "./assets/blue_sunglasses.png"
+                    ),
+                    "glasses",
+                    "A cat wearing glasses on a snowy field",
+                ],
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/statue.jpg"),
+                    "statue",
+                    os.path.join(os.path.dirname(__file__), "./assets/toilet.jpg"),
+                    "toilet",
+                    "statue sitting on a toilet",
+                ],
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/teddy.jpg"),
+                    "teddy",
+                    os.path.join(os.path.dirname(__file__), "./assets/luffy_hat.jpg"),
+                    "hat",
+                    "a teddy wearing the hat at a beach",
+                ],
+                [
+                    os.path.join(os.path.dirname(__file__), "./assets/chair.jpg"),
+                    "chair",
+                    os.path.join(os.path.dirname(__file__), "./assets/table.jpg"),
+                    "table",
+                    "a chair and table in living room",
+                ],
+            ],
+            inputs=inputs,
             fn=model.run,
             outputs=result,
         )
-
+
         run_button.click(fn=model.run, inputs=inputs, outputs=result)
     return demo
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     demo = create_demo()
-    demo.queue(max_size=20).launch()
+    demo.queue(max_size=20).launch()
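
The reworked `run()`/`inference()` path threads a user-supplied seed through a `torch.Generator` that is shared by the prior and the decoder, so identical inputs plus the same seed should reproduce the same image. A minimal sketch of calling the updated `Model` outside the Gradio UI, assuming the revised `app.py` is importable as `app`, that `./assets/cat.png` exists, and that a CUDA device is available (all three are assumptions for illustration, not part of this commit):

```python
# Minimal usage sketch of the updated Model.run() signature.
# Assumes: the revised app.py is importable as `app`, ./assets/cat.png exists,
# and a CUDA GPU is present (inference() builds a torch.Generator on "cuda").
import PIL.Image

from app import Model

model = Model()
cat = PIL.Image.open("./assets/cat.png")

result = model.run(
    image={"composite": cat},      # gr.ImageEditor passes a dict with a "composite" key
    keyword="cat",
    image2=None,                   # falsy -> second subject image is skipped
    keyword2="no subject",         # sentinel keyword ignored by run()
    text="A photo of a cat on a snowy field",
    seed=42,                       # same seed + same inputs -> same output image
)
result.save("cat_result.png")
```

Passing the seed explicitly keeps both stages (the `pipe_prior` embedding step and the Kandinsky decoder) on one generator, which is what makes the output deterministic across runs.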