kimsan0622
committed on
Commit
•
f68803e
1
Parent(s):
db26b82
Upload processor
Browse files- preprocessor_config.json +7 -2
- processing_veld.py +4 -4
- special_tokens_map.json +49 -105
- spiece.model +2 -2
- tokenizer.json +0 -0
- tokenizer_config.json +60 -110
preprocessor_config.json
CHANGED
@@ -3,13 +3,14 @@
|
|
3 |
"AutoProcessor": "processing_veld.VELDProcessor"
|
4 |
},
|
5 |
"do_normalize": true,
|
|
|
6 |
"do_resize": true,
|
7 |
-
"feature_extractor_type": "ViTFeatureExtractor",
|
8 |
"image_mean": [
|
9 |
0.5,
|
10 |
0.5,
|
11 |
0.5
|
12 |
],
|
|
|
13 |
"image_std": [
|
14 |
0.5,
|
15 |
0.5,
|
@@ -17,5 +18,9 @@
|
|
17 |
],
|
18 |
"processor_class": "VELDProcessor",
|
19 |
"resample": 2,
|
20 |
-
"
|
|
|
|
|
|
|
|
|
21 |
}
|
|
|
3 |
"AutoProcessor": "processing_veld.VELDProcessor"
|
4 |
},
|
5 |
"do_normalize": true,
|
6 |
+
"do_rescale": true,
|
7 |
"do_resize": true,
|
|
|
8 |
"image_mean": [
|
9 |
0.5,
|
10 |
0.5,
|
11 |
0.5
|
12 |
],
|
13 |
+
"image_processor_type": "ViTImageProcessor",
|
14 |
"image_std": [
|
15 |
0.5,
|
16 |
0.5,
|
|
|
18 |
],
|
19 |
"processor_class": "VELDProcessor",
|
20 |
"resample": 2,
|
21 |
+
"rescale_factor": 0.00392156862745098,
|
22 |
+
"size": {
|
23 |
+
"height": 384,
|
24 |
+
"width": 384
|
25 |
+
}
|
26 |
}
|
processing_veld.py
CHANGED
@@ -24,16 +24,16 @@ class VELDProcessor(ProcessorMixin):
|
|
24 |
r"""
|
25 |
Constructs a VELD processor which wraps a vision feature extractor and a tokenizer into a single
|
26 |
processor.
|
27 |
-
[`VELDProcessor`] offers all the functionalities of [`
|
28 |
[`AutoTokenizer`]. See the [`~VELDProcessor.__call__`] and
|
29 |
[`~VELDProcessor.decode`] for more information.
|
30 |
Args:
|
31 |
-
feature_extractor ([`
|
32 |
The feature extractor is a required input.
|
33 |
tokenizer ([`PreTrainedTokenizer`]):
|
34 |
The tokenizer is a required input.
|
35 |
"""
|
36 |
-
feature_extractor_class = "
|
37 |
tokenizer_class = "AutoTokenizer"
|
38 |
|
39 |
def __init__(self, feature_extractor, tokenizer):
|
@@ -45,7 +45,7 @@ class VELDProcessor(ProcessorMixin):
|
|
45 |
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
46 |
and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
|
47 |
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
48 |
-
|
49 |
docstring of the above two methods for more information.
|
50 |
Args:
|
51 |
text (`str`, `List[str]`, `List[List[str]]`):
|
|
|
24 |
r"""
|
25 |
Constructs a VELD processor which wraps a vision feature extractor and a tokenizer into a single
|
26 |
processor.
|
27 |
+
[`VELDProcessor`] offers all the functionalities of [`AutoImageProcessor`] and
|
28 |
[`AutoTokenizer`]. See the [`~VELDProcessor.__call__`] and
|
29 |
[`~VELDProcessor.decode`] for more information.
|
30 |
Args:
|
31 |
+
feature_extractor ([`AutoImageProcessor`]):
|
32 |
The feature extractor is a required input.
|
33 |
tokenizer ([`PreTrainedTokenizer`]):
|
34 |
The tokenizer is a required input.
|
35 |
"""
|
36 |
+
feature_extractor_class = "AutoImageProcessor"
|
37 |
tokenizer_class = "AutoTokenizer"
|
38 |
|
39 |
def __init__(self, feature_extractor, tokenizer):
|
|
|
45 |
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
46 |
and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
|
47 |
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
48 |
+
AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the
|
49 |
docstring of the above two methods for more information.
|
50 |
Args:
|
51 |
text (`str`, `List[str]`, `List[List[str]]`):
|
special_tokens_map.json
CHANGED
@@ -1,107 +1,51 @@
|
|
1 |
{
|
2 |
-
"
|
3 |
-
"<
|
4 |
-
"
|
5 |
-
"
|
6 |
-
"
|
7 |
-
"
|
8 |
-
|
9 |
-
|
10 |
-
"
|
11 |
-
"
|
12 |
-
"
|
13 |
-
"
|
14 |
-
"
|
15 |
-
|
16 |
-
|
17 |
-
"
|
18 |
-
"
|
19 |
-
"
|
20 |
-
"
|
21 |
-
"
|
22 |
-
|
23 |
-
|
24 |
-
"
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"
|
28 |
-
"
|
29 |
-
|
30 |
-
|
31 |
-
"<
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"
|
35 |
-
"
|
36 |
-
|
37 |
-
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
-
"
|
42 |
-
"
|
43 |
-
|
44 |
-
|
45 |
-
"<
|
46 |
-
"
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
|
51 |
-
"<extra_id_48>",
|
52 |
-
"<extra_id_49>",
|
53 |
-
"<extra_id_50>",
|
54 |
-
"<extra_id_51>",
|
55 |
-
"<extra_id_52>",
|
56 |
-
"<extra_id_53>",
|
57 |
-
"<extra_id_54>",
|
58 |
-
"<extra_id_55>",
|
59 |
-
"<extra_id_56>",
|
60 |
-
"<extra_id_57>",
|
61 |
-
"<extra_id_58>",
|
62 |
-
"<extra_id_59>",
|
63 |
-
"<extra_id_60>",
|
64 |
-
"<extra_id_61>",
|
65 |
-
"<extra_id_62>",
|
66 |
-
"<extra_id_63>",
|
67 |
-
"<extra_id_64>",
|
68 |
-
"<extra_id_65>",
|
69 |
-
"<extra_id_66>",
|
70 |
-
"<extra_id_67>",
|
71 |
-
"<extra_id_68>",
|
72 |
-
"<extra_id_69>",
|
73 |
-
"<extra_id_70>",
|
74 |
-
"<extra_id_71>",
|
75 |
-
"<extra_id_72>",
|
76 |
-
"<extra_id_73>",
|
77 |
-
"<extra_id_74>",
|
78 |
-
"<extra_id_75>",
|
79 |
-
"<extra_id_76>",
|
80 |
-
"<extra_id_77>",
|
81 |
-
"<extra_id_78>",
|
82 |
-
"<extra_id_79>",
|
83 |
-
"<extra_id_80>",
|
84 |
-
"<extra_id_81>",
|
85 |
-
"<extra_id_82>",
|
86 |
-
"<extra_id_83>",
|
87 |
-
"<extra_id_84>",
|
88 |
-
"<extra_id_85>",
|
89 |
-
"<extra_id_86>",
|
90 |
-
"<extra_id_87>",
|
91 |
-
"<extra_id_88>",
|
92 |
-
"<extra_id_89>",
|
93 |
-
"<extra_id_90>",
|
94 |
-
"<extra_id_91>",
|
95 |
-
"<extra_id_92>",
|
96 |
-
"<extra_id_93>",
|
97 |
-
"<extra_id_94>",
|
98 |
-
"<extra_id_95>",
|
99 |
-
"<extra_id_96>",
|
100 |
-
"<extra_id_97>",
|
101 |
-
"<extra_id_98>",
|
102 |
-
"<extra_id_99>"
|
103 |
-
],
|
104 |
-
"eos_token": "</s>",
|
105 |
-
"pad_token": "<pad>",
|
106 |
-
"unk_token": "<unk>"
|
107 |
}
|
|
|
1 |
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "[CLS]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"eos_token": {
|
17 |
+
"content": "</s>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": true,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"mask_token": {
|
24 |
+
"content": "[MASK]",
|
25 |
+
"lstrip": true,
|
26 |
+
"normalized": true,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"pad_token": {
|
31 |
+
"content": "<pad>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": true,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "[SEP]",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": true,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
+
"unk_token": {
|
45 |
+
"content": "<unk>",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": true,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false
|
50 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
spiece.model
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4368e6b7901e8c54bd62326ca5a5063eba36c31d74995a599a5be77a0cd5cfd0
|
3 |
+
size 1592581
|
tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
CHANGED
@@ -1,117 +1,67 @@
|
|
1 |
{
|
2 |
-
"additional_special_tokens": [
|
3 |
-
"<extra_id_0>",
|
4 |
-
"<extra_id_1>",
|
5 |
-
"<extra_id_2>",
|
6 |
-
"<extra_id_3>",
|
7 |
-
"<extra_id_4>",
|
8 |
-
"<extra_id_5>",
|
9 |
-
"<extra_id_6>",
|
10 |
-
"<extra_id_7>",
|
11 |
-
"<extra_id_8>",
|
12 |
-
"<extra_id_9>",
|
13 |
-
"<extra_id_10>",
|
14 |
-
"<extra_id_11>",
|
15 |
-
"<extra_id_12>",
|
16 |
-
"<extra_id_13>",
|
17 |
-
"<extra_id_14>",
|
18 |
-
"<extra_id_15>",
|
19 |
-
"<extra_id_16>",
|
20 |
-
"<extra_id_17>",
|
21 |
-
"<extra_id_18>",
|
22 |
-
"<extra_id_19>",
|
23 |
-
"<extra_id_20>",
|
24 |
-
"<extra_id_21>",
|
25 |
-
"<extra_id_22>",
|
26 |
-
"<extra_id_23>",
|
27 |
-
"<extra_id_24>",
|
28 |
-
"<extra_id_25>",
|
29 |
-
"<extra_id_26>",
|
30 |
-
"<extra_id_27>",
|
31 |
-
"<extra_id_28>",
|
32 |
-
"<extra_id_29>",
|
33 |
-
"<extra_id_30>",
|
34 |
-
"<extra_id_31>",
|
35 |
-
"<extra_id_32>",
|
36 |
-
"<extra_id_33>",
|
37 |
-
"<extra_id_34>",
|
38 |
-
"<extra_id_35>",
|
39 |
-
"<extra_id_36>",
|
40 |
-
"<extra_id_37>",
|
41 |
-
"<extra_id_38>",
|
42 |
-
"<extra_id_39>",
|
43 |
-
"<extra_id_40>",
|
44 |
-
"<extra_id_41>",
|
45 |
-
"<extra_id_42>",
|
46 |
-
"<extra_id_43>",
|
47 |
-
"<extra_id_44>",
|
48 |
-
"<extra_id_45>",
|
49 |
-
"<extra_id_46>",
|
50 |
-
"<extra_id_47>",
|
51 |
-
"<extra_id_48>",
|
52 |
-
"<extra_id_49>",
|
53 |
-
"<extra_id_50>",
|
54 |
-
"<extra_id_51>",
|
55 |
-
"<extra_id_52>",
|
56 |
-
"<extra_id_53>",
|
57 |
-
"<extra_id_54>",
|
58 |
-
"<extra_id_55>",
|
59 |
-
"<extra_id_56>",
|
60 |
-
"<extra_id_57>",
|
61 |
-
"<extra_id_58>",
|
62 |
-
"<extra_id_59>",
|
63 |
-
"<extra_id_60>",
|
64 |
-
"<extra_id_61>",
|
65 |
-
"<extra_id_62>",
|
66 |
-
"<extra_id_63>",
|
67 |
-
"<extra_id_64>",
|
68 |
-
"<extra_id_65>",
|
69 |
-
"<extra_id_66>",
|
70 |
-
"<extra_id_67>",
|
71 |
-
"<extra_id_68>",
|
72 |
-
"<extra_id_69>",
|
73 |
-
"<extra_id_70>",
|
74 |
-
"<extra_id_71>",
|
75 |
-
"<extra_id_72>",
|
76 |
-
"<extra_id_73>",
|
77 |
-
"<extra_id_74>",
|
78 |
-
"<extra_id_75>",
|
79 |
-
"<extra_id_76>",
|
80 |
-
"<extra_id_77>",
|
81 |
-
"<extra_id_78>",
|
82 |
-
"<extra_id_79>",
|
83 |
-
"<extra_id_80>",
|
84 |
-
"<extra_id_81>",
|
85 |
-
"<extra_id_82>",
|
86 |
-
"<extra_id_83>",
|
87 |
-
"<extra_id_84>",
|
88 |
-
"<extra_id_85>",
|
89 |
-
"<extra_id_86>",
|
90 |
-
"<extra_id_87>",
|
91 |
-
"<extra_id_88>",
|
92 |
-
"<extra_id_89>",
|
93 |
-
"<extra_id_90>",
|
94 |
-
"<extra_id_91>",
|
95 |
-
"<extra_id_92>",
|
96 |
-
"<extra_id_93>",
|
97 |
-
"<extra_id_94>",
|
98 |
-
"<extra_id_95>",
|
99 |
-
"<extra_id_96>",
|
100 |
-
"<extra_id_97>",
|
101 |
-
"<extra_id_98>",
|
102 |
-
"<extra_id_99>"
|
103 |
-
],
|
104 |
"auto_map": {
|
105 |
"AutoProcessor": "processing_veld.VELDProcessor"
|
106 |
},
|
107 |
-
"
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
"processor_class": "VELDProcessor",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
"sp_model_kwargs": {},
|
114 |
-
"special_tokens_map_file":
|
115 |
-
"tokenizer_class": "
|
116 |
-
"unk_token":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
}
|
|
|
1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
"auto_map": {
|
3 |
"AutoProcessor": "processing_veld.VELDProcessor"
|
4 |
},
|
5 |
+
"bos_token": {
|
6 |
+
"__type": "AddedToken",
|
7 |
+
"content": "<s>",
|
8 |
+
"lstrip": false,
|
9 |
+
"normalized": true,
|
10 |
+
"rstrip": false,
|
11 |
+
"single_word": false
|
12 |
+
},
|
13 |
+
"cls_token": {
|
14 |
+
"__type": "AddedToken",
|
15 |
+
"content": "[CLS]",
|
16 |
+
"lstrip": false,
|
17 |
+
"normalized": true,
|
18 |
+
"rstrip": false,
|
19 |
+
"single_word": false
|
20 |
+
},
|
21 |
+
"eos_token": {
|
22 |
+
"__type": "AddedToken",
|
23 |
+
"content": "</s>",
|
24 |
+
"lstrip": false,
|
25 |
+
"normalized": true,
|
26 |
+
"rstrip": false,
|
27 |
+
"single_word": false
|
28 |
+
},
|
29 |
+
"mask_token": {
|
30 |
+
"__type": "AddedToken",
|
31 |
+
"content": "[MASK]",
|
32 |
+
"lstrip": true,
|
33 |
+
"normalized": true,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"model_max_length": 1000000000000000019884624838656,
|
38 |
+
"name_or_path": "veld_e0_linear",
|
39 |
+
"pad_token": {
|
40 |
+
"__type": "AddedToken",
|
41 |
+
"content": "<pad>",
|
42 |
+
"lstrip": false,
|
43 |
+
"normalized": true,
|
44 |
+
"rstrip": false,
|
45 |
+
"single_word": false
|
46 |
+
},
|
47 |
"processor_class": "VELDProcessor",
|
48 |
+
"sep_token": {
|
49 |
+
"__type": "AddedToken",
|
50 |
+
"content": "[SEP]",
|
51 |
+
"lstrip": false,
|
52 |
+
"normalized": true,
|
53 |
+
"rstrip": false,
|
54 |
+
"single_word": false
|
55 |
+
},
|
56 |
"sp_model_kwargs": {},
|
57 |
+
"special_tokens_map_file": "vocab/ko_en/spiece/ko20000vs64000_ext/special_tokens_map.json",
|
58 |
+
"tokenizer_class": "BigBirdTokenizer",
|
59 |
+
"unk_token": {
|
60 |
+
"__type": "AddedToken",
|
61 |
+
"content": "<unk>",
|
62 |
+
"lstrip": false,
|
63 |
+
"normalized": true,
|
64 |
+
"rstrip": false,
|
65 |
+
"single_word": false
|
66 |
+
}
|
67 |
}
|