adjust image processing for batch output (#63)
opened by HwwwH

Files changed:
- config.json (+1, -0)
- image_processing_minicpmv.py (+1, -1)
- processing_minicpmv.py (+9, -12)
config.json CHANGED
@@ -1,5 +1,6 @@
 {
   "_name_or_path": "openbmb/MiniCPM-Llama3-V-2_5",
+  "version": "2.5",
   "architectures": [
     "MiniCPMV"
   ],
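For context on consuming the new key: transformers attaches unrecognized config.json entries as plain attributes on the loaded config object, so downstream code can branch on the model version. A minimal sketch, assuming the checkpoint is loaded with trust_remote_code as the MiniCPM-V readme instructs:

from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True
)
# Extra config.json keys become attributes; fall back to None for
# checkpoints that predate this change.
version = getattr(config, "version", None)
print(version)  # "2.5" once this PR is applied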
image_processing_minicpmv.py CHANGED
@@ -396,7 +396,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
         if tgt_sizes:
             tgt_sizes = np.vstack(tgt_sizes)
         return MiniCPMVBatchFeature(
-            data={"pixel_values": new_images, "image_sizes": image_sizes, "tgt_sizes": tgt_sizes}, tensor_type=return_tensors
+            data={"pixel_values": [new_images], "image_sizes": [image_sizes], "tgt_sizes": [tgt_sizes]}, tensor_type=return_tensors
         )
 
 AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
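The change is just one level of nesting: each field is wrapped in an outer list so the returned features carry a leading batch axis (one entry per sample) instead of a flat per-slice list. A hedged sketch of what a caller sees, with the repo id and single-image call assumed for illustration:

from PIL import Image
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(
    "openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True
)
features = image_processor(Image.open("example.jpg"), return_tensors=None)

# With this PR the outputs are indexed by sample first:
# features["pixel_values"][0] -> list of slice tensors for sample 0
# features["image_sizes"][0]  -> sizes of the images in sample 0
# features["tgt_sizes"][0]    -> per-slice target grid sizes for sample 0

This is why the processor change below switches from image_sizes[i] to image_sizes[0][i].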
processing_minicpmv.py CHANGED
@@ -61,14 +61,10 @@ class MiniCPMVProcessor(ProcessorMixin):
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
     ) -> MiniCPMVBatchFeature:
         """
-        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
-        of the above two methods for more information.
+        Only support for single input for now. Batched input is coming soon.
 
         Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
+            text (`str`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
@@ -176,19 +172,19 @@
             images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
 
             image_tags = re.findall(pattern, texts)
-            assert len(image_tags) == len(image_sizes)
+            assert len(image_tags) == len(image_sizes[0])
             text_chunks = texts.split(pattern)
             final_texts = ""
             for i in range(len(image_tags)):
-                final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[i])
+                final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[0][i])
             final_texts += text_chunks[-1]
             input_ids, image_bounds = self._convert(final_texts, max_length)
             return MiniCPMVBatchFeature(data={
                 "input_ids": input_ids,
-                "pixel_values": [images],
-                "image_sizes": [image_sizes],
+                "pixel_values": images,
+                "image_sizes": image_sizes,
                 "image_bound": [image_bounds],
-                "tgt_sizes": [tgt_sizes]
+                "tgt_sizes": tgt_sizes
             })
 
     @property
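Taken together with the image-processor change, the processor now consumes pre-batched features and indexes sample 0 everywhere, consistent with the new single-input docstring. A hedged usage sketch (the image-tag syntax and exact call signature come from parts of the file this diff does not show):

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True
)

# One image tag in the text per input image; the tag syntax is assumed here.
inputs = processor(
    text="(<image>./</image>)\nWhat is in this picture?",
    images=[Image.open("example.jpg")],
)
# inputs["pixel_values"], inputs["image_sizes"], inputs["tgt_sizes"] are passed
# through from the image processor with their new leading batch axis, so the
# assert above compares against len(image_sizes[0]).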
@@ -244,4 +240,5 @@ class MiniCPMVProcessor(ProcessorMixin):
             else:
                 tensor[i, : len(item[key][0]), :] = item[key][0].clone()
 
-            return tensor
+        return tensor
+
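Reading the final hunk, `return tensor` appears to move out of the per-item loop (plus a trailing newline), so the padded tensor is only handed back after every sample in the batch has been copied in; returning inside the loop would stop after the first item. The pattern in isolation, as a sketch with illustrative names rather than the file's actual helper:

import torch

def pad_batch(seqs, padding_value=0):
    # Right-pad variable-length 1-D tensors into one (batch, max_len) tensor.
    max_len = max(s.size(0) for s in seqs)
    tensor = torch.full((len(seqs), max_len), padding_value, dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        tensor[i, : s.size(0)] = s
    # The return sits at function level: placing it inside the loop (the bug
    # this hunk fixes) would return after padding only the first sequence.
    return tensor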