bilgeyucel committed
Commit 102b698
1 parent: e5db11f

Update image_captioner.py (#2)


- Update image_captioner.py (4397b1ed3f8f85548ec132c4ce448a382a0925a4)
- Update app.py (40673d5c8acf776974d6f638fd09d900f373eac2)

Files changed (2):
  1. app.py +6 -6
  2. image_captioner.py +8 -16
app.py CHANGED
@@ -24,28 +24,28 @@ prompt_template = """
 You will receive a descriptive text of a photo.
 Try to generate a nice Instagram caption with a phrase rhyming with the text. Include emojis in the caption.
 
-Descriptive text: {{captions[0]}};
+Descriptive text: {{caption}};
 Instagram Caption:
 """
 
 hf_api_key = os.environ["HF_API_KEY"]
 
-def generate_caption(image_file_paths, model_name):
+def generate_caption(image_file_path, model_name):
     image_to_text = ImageCaptioner(
         model_name="nlpconnect/vit-gpt2-image-captioning",
     )
     prompt_builder = PromptBuilder(template=prompt_template)
-    generator = HuggingFaceTGIGenerator(model=model_name, token=Secret.from_token(hf_api_key))
+    generator = HuggingFaceTGIGenerator(model=model_name, token=Secret.from_token(hf_api_key), generation_kwargs={"max_new_tokens":50})
     captioning_pipeline = Pipeline()
     captioning_pipeline.add_component("image_to_text", image_to_text)
     captioning_pipeline.add_component("prompt_builder", prompt_builder)
     captioning_pipeline.add_component("generator", generator)
 
-    captioning_pipeline.connect("image_to_text.captions", "prompt_builder.captions")
+    captioning_pipeline.connect("image_to_text.caption", "prompt_builder.caption")
     captioning_pipeline.connect("prompt_builder", "generator")
 
-    result = captioning_pipeline.run({"image_to_text":{"image_file_paths":image_file_paths}})
-    return result["generator"][0]
+    result = captioning_pipeline.run({"image_to_text":{"image_file_path":image_file_path}})
+    return result["generator"]["replies"][0]
 
 with gr.Blocks(theme="soft") as demo:
     gr.Markdown(value=description)
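
For review purposes, a minimal sketch of exercising the reworked single-image flow, assuming app.py's definitions are in scope and HF_API_KEY is exported before import; the image path and model id below are placeholder assumptions, not values from this commit:

# Hypothetical smoke test for the updated generate_caption
# (single path in, single reply out). Both arguments are placeholders.
caption = generate_caption(
    image_file_path="selfie.png",                     # any local image file
    model_name="mistralai/Mistral-7B-Instruct-v0.1",  # any TGI-served model id
)
print(caption)  # the first generated reply: result["generator"]["replies"][0]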
image_captioner.py CHANGED
@@ -37,20 +37,16 @@ class ImageCaptioner:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
 
-    @component.output_types(captions=List[str])
-    def run(self, image_file_paths: List[str]) -> List[Document]:
+    @component.output_types(caption=str)
+    def run(self, image_file_path: str) -> List[Document]:
 
-        images = []
-        for image_path in image_file_paths:
-            i_image = Image.open(image_path)
-            if i_image.mode != "RGB":
-                i_image = i_image.convert(mode="RGB")
-
-            images.append(i_image)
+        i_image = Image.open(image_file_path)
+        if i_image.mode != "RGB":
+            i_image = i_image.convert(mode="RGB")
 
         preds = []
         if self.model_name == "nlpconnect/vit-gpt2-image-captioning":
-            pixel_values = self.feature_extractor(images=images, return_tensors="pt").pixel_values
+            pixel_values = self.feature_extractor(images=[i_image], return_tensors="pt").pixel_values
             pixel_values = pixel_values.to(self.device)
 
             output_ids = self.model.generate(pixel_values, **self.gen_kwargs)
@@ -59,7 +55,7 @@ class ImageCaptioner:
             preds = [pred.strip() for pred in preds]
         else:
 
-            inputs = self.processor(images, return_tensors="pt")
+            inputs = self.processor([i_image], return_tensors="pt")
             output_ids = self.model.generate(**inputs)
             preds = self.processor.batch_decode(output_ids, skip_special_tokens=True)
             preds = [pred.strip() for pred in preds]
@@ -68,8 +64,4 @@ class ImageCaptioner:
         # for caption, image_file_path in zip(preds, image_file_paths):
         #     document = Document(content=caption, meta={"image_path": image_file_path})
         #     captions.append(document)
-        return {"captions": preds}
-
-# captioner = ImageCaptioner(model_name="Salesforce/blip-image-captioning-base")
-# result = captioner.run(image_file_paths=["selfie.png"])
-# print(result)
+        return {"caption": preds[0]}
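
The commented-out smoke test deleted at the bottom of the file can be reproduced against the new single-image signature; a sketch, assuming a local selfie.png exists (the path is a placeholder carried over from the removed comment):

# Rough equivalent of the deleted smoke test, adapted to the new API.
from image_captioner import ImageCaptioner

captioner = ImageCaptioner(model_name="Salesforce/blip-image-captioning-base")
result = captioner.run(image_file_path="selfie.png")  # placeholder image path
print(result["caption"])  # run() now returns {"caption": str} instead of {"captions": List[str]}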