Commit
•
2e8ba46
1
Parent(s):
e7c16cf
Upload processor
Browse files
- preprocessor_config.json +3 -0
- processor.py +43 -0
- tokenizer_config.json +3 -0
preprocessor_config.json
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
{
|
|
|
|
|
|
|
2 |
"crop_size": {
|
3 |
"height": 224,
|
4 |
"width": 224
|
|
|
1 |
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoProcessor": "processor.GIAProcessor"
|
4 |
+
},
|
5 |
"crop_size": {
|
6 |
"height": 224,
|
7 |
"width": 224
|
processor.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from itertools import chain
|
2 |
+
from transformers import GitProcessor
|
3 |
+
|
4 |
+
class GIAProcessor(GitProcessor):
    """Processor that extends ``GitProcessor`` with text-only chunking.

    When called with text but no images, the text is tokenized and the
    resulting token sequences are concatenated and re-split into fixed-size
    blocks of ``_block_size`` tokens (the usual causal-LM "group_texts"
    preprocessing). When both text and images are given, it defers to the
    parent ``GitProcessor`` behavior.
    """

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)
        # Number of tokens per grouped chunk for text-only inputs.
        self._block_size = 1024

    def _group_texts(self, examples):
        """Concatenate all tokenized sequences and split into equal blocks.

        Args:
            examples: mapping of feature name (e.g. ``input_ids``,
                ``attention_mask``) to a batch (list) of sequences.

        Returns:
            A dict with the same keys, where each value is a list of chunks
            of exactly ``self._block_size`` tokens. The trailing remainder
            shorter than ``_block_size`` is dropped; if the total length is
            below ``_block_size`` the resulting lists are empty.
        """
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; padding could be added instead if the
        # model supported it — customize this to your needs.
        total_length = (total_length // self._block_size) * self._block_size
        # Split by chunks of _block_size.
        result = {
            k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
            for k, t in concatenated_examples.items()
        }
        return result

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """Tokenize text (grouping into blocks) and/or process images.

        Raises:
            ValueError: if ``text`` is not provided. (Bug fix: the original
                fell through both branches when ``text`` was None and raised
                an opaque ``UnboundLocalError`` on ``return encoding``.)
        """
        if text is not None and images is None:
            encoded_text = self.tokenizer(text, return_tensors=return_tensors)
            encoding = self._group_texts(encoded_text)
        elif text is not None and images is not None:
            encoding = super().__call__(text, images, return_tensors, **kwargs)
        else:
            raise ValueError("You have to specify `text` (optionally together with `images`).")

        return encoding

    def batch_decode(self, *args, **kwargs):
        """Forward to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        # Names of the tensors the downstream model consumes.
        return ["input_ids", "attention_mask", "pixel_values"]
41 |
+
|
42 |
+
|
43 |
+
# Register the class so that AutoProcessor.from_pretrained(...) with
# trust_remote_code=True resolves the "AutoProcessor": "processor.GIAProcessor"
# entry declared in the repo's preprocessor_config.json / tokenizer_config.json
# "auto_map" sections to this custom processor.
GIAProcessor.register_for_auto_class("AutoProcessor")
|
tokenizer_config.json
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
{
|
|
|
|
|
|
|
2 |
"clean_up_tokenization_spaces": true,
|
3 |
"cls_token": "[CLS]",
|
4 |
"do_lower_case": true,
|
|
|
1 |
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoProcessor": "processor.GIAProcessor"
|
4 |
+
},
|
5 |
"clean_up_tokenization_spaces": true,
|
6 |
"cls_token": "[CLS]",
|
7 |
"do_lower_case": true,
|