BAAI
/

Emu3-Chat

ryanzhangfan committed on Oct 17

Commit

bf2ce69

•

1 Parent(s): e7d43f2

add support for batch image generation

Files changed (2) hide show

processing_emu3.py CHANGED Viewed

@@ -84,7 +84,7 @@ class Emu3Processor(ProcessorMixin):
         image: Optional[Image.Image | List[Image.Image]] = None,
         *,
         mode: str = "G",
-        ratio: str = "1:1",
         image_area: int = 518400,
         **kwargs,
     ) -> BatchFeature:
@@ -129,8 +129,11 @@ class Emu3Processor(ProcessorMixin):
             if image is not None:
                 raise ValueError("You have to specify only `text` in generation mode")
-            if len(text) > 1:
-                raise ValueError("`text` can only be `str` in generation mode")
         else:
             if image is None:
                 raise ValueError("Invalid input image. Please provide exactly one PIL.Image.Image per text.")
@@ -165,7 +168,7 @@ class Emu3Processor(ProcessorMixin):
                 )
                 prompt += self.chat_template.format(image_prompt=image_prompt, text_prompt=text_prompt)
             else:
-                h, w = self.calculate_generate_size(ratio, image_area, self.vision_tokenizer.spatial_scale_factor)
                 image_prompt = (
                     self.tokenizer.boi_token +
                     self.prefix_template.format(H=h, W=w) +

         image: Optional[Image.Image | List[Image.Image]] = None,
         *,
         mode: str = "G",
+        ratio: str | List[str] = "1:1",
         image_area: int = 518400,
         **kwargs,
     ) -> BatchFeature:
             if image is not None:
                 raise ValueError("You have to specify only `text` in generation mode")
+            if isinstance(ratio, str):
+                ratio = [ratio] * len(text)
+            if len(ratio) != len(text):
+                raise ValueError("ratio number must match text number")
         else:
             if image is None:
                 raise ValueError("Invalid input image. Please provide exactly one PIL.Image.Image per text.")
                 )
                 prompt += self.chat_template.format(image_prompt=image_prompt, text_prompt=text_prompt)
             else:
+                h, w = self.calculate_generate_size(ratio[idx], image_area, self.vision_tokenizer.spatial_scale_factor)
                 image_prompt = (
                     self.tokenizer.boi_token +
                     self.prefix_template.format(H=h, W=w) +

utils_emu3.py CHANGED Viewed

@@ -47,16 +47,22 @@ class Emu3PrefixConstrainedLogitsHelper:
             position = torch.nonzero(input_ids == self.img_token, as_tuple=True)[0][0]
             self.offset_cache[batch_id] = position
         offset = input_ids.shape[0] - self.offset_cache[batch_id]
-        if offset % (self.width + 1) == 0:
             return (self.eol_token, )
-        elif offset == (self.width + 1) * self.height + 1:
             return (self.eof_token, )
-        elif offset == (self.width + 1) * self.height + 2:
             return (self.eoi_token, )
-        elif offset == (self.width + 1) * self.height + 3:
             return (self.eos_token, )
-        elif offset > (self.width + 1) * self.height + 3:
             return (self.pad_token, )
         else:
             return self.visual_tokens

             position = torch.nonzero(input_ids == self.img_token, as_tuple=True)[0][0]
             self.offset_cache[batch_id] = position
+        height = self.height[batch_id] if self.height.shape[0] > 1 else self.height[0]
+        width = self.width[batch_id] if self.width.shape[0] > 1 else self.width[0]
         offset = input_ids.shape[0] - self.offset_cache[batch_id]
+        height = height.to(offset.device)
+        width = width.to(offset.device)
+        if offset % (width + 1) == 0:
             return (self.eol_token, )
+        elif offset == (width + 1) * height + 1:
             return (self.eof_token, )
+        elif offset == (width + 1) * height + 2:
             return (self.eoi_token, )
+        elif offset == (width + 1) * height + 3:
             return (self.eos_token, )
+        elif offset > (width + 1) * height + 3:
             return (self.pad_token, )
         else:
             return self.visual_tokens