Mizukiluke committed on
Commit 6da9385
1 Parent(s): 421b2e2

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,190 @@
{
  "model_type": "mplug_owl2_1",
  "multiway": true,
  "attn_dropout_prob": 0.0,
  "bf16": false,
  "emb_dropout_prob": 0.0,
  "fp16": false,
  "fp32": false,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 22016,
  "kv_channels": 128,
  "layer_norm_epsilon": 1e-06,
  "max_position_embeddings": 8192,
  "no_bias": true,
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "onnx_safe": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 1.0,
  "scale_attn_weights": true,
  "seq_length": 2048,
  "tie_word_embeddings": false,
  "tokenizer_type": "QWenTokenizer",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "use_dynamic_ntk": true,
  "use_flash_attn": false,
  "use_logn_attn": true,
  "visual_config": {
    "visual_abstractor": {
      "_name_or_path": "",
      "use_cls_token": false,
      "add_cross_attention": false,
      "architectures": null,
      "attention_probs_dropout_prob": 0.0,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "early_stopping": false,
      "encoder_hidden_size": 1664,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "grid_size": 32,
      "hidden_size": 1664,
      "id2label": {
        "0": "LABEL_0",
        "1": "LABEL_1"
      },
      "initializer_range": 0.02,
      "intermediate_size": 5632,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {
        "LABEL_0": 0,
        "LABEL_1": 1
      },
      "layer_norm_eps": 1e-06,
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "mplug_owl_visual_abstract",
      "no_repeat_ngram_size": 0,
      "num_attention_heads": 16,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_hidden_layers": 6,
      "num_learnable_queries": 64,
      "num_return_sequences": 1,
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "return_dict": true,
      "return_dict_in_generate": false,
      "sep_token_id": null,
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torch_dtype": null,
      "torchscript": false,
      "transformers_version": "4.28.1",
      "typical_p": 1.0,
      "use_bfloat16": false
    },
    "visual_model": {
      "_name_or_path": "",
      "use_cls_token": false,
      "use_post_layernorm": false,
      "add_cross_attention": false,
      "architectures": null,
      "attention_dropout": 0.0,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "early_stopping": false,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "hidden_act": "gelu",
      "hidden_size": 1664,
      "id2label": {
        "0": "LABEL_0",
        "1": "LABEL_1"
      },
      "image_size": 448,
      "initializer_factor": 1.0,
      "initializer_range": 0.02,
      "intermediate_size": 8192,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {
        "LABEL_0": 0,
        "LABEL_1": 1
      },
      "layer_norm_eps": 1e-06,
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "mplug_owl_vision_model",
      "no_repeat_ngram_size": 0,
      "num_attention_heads": 16,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_channels": 3,
      "num_hidden_layers": 48,
      "num_return_sequences": 1,
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "patch_size": 14,
      "prefix": null,
      "problem_type": null,
      "projection_dim": 768,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "return_dict": true,
      "return_dict_in_generate": false,
      "sep_token_id": null,
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torch_dtype": null,
      "torchscript": false,
      "transformers_version": "4.28.1",
      "typical_p": 1.0,
      "use_bfloat16": false,
      "use_flash_attn": false
    }
  },
  "vocab_size": 151936
}
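For a quick sanity check, the config can be read as plain JSON from a local copy of the repo. The snippet below is a minimal sketch (the file path is whatever your local snapshot uses) that pulls out the headline hyperparameters: a Qwen-7B-sized language model (hidden size 4096, 32 layers) paired with a 448-pixel vision tower and a 64-query visual abstractor.

import json

# Minimal sketch: inspect the uploaded config from a local copy of the repo.
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                                    # mplug_owl2_1
print(cfg["hidden_size"], cfg["num_hidden_layers"])         # 4096 32
print(cfg["visual_config"]["visual_model"]["image_size"])   # 448 (ViT input resolution)
print(cfg["visual_config"]["visual_abstractor"]["num_learnable_queries"])  # 64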
configuration.json ADDED
@@ -0,0 +1 @@
{"framework":"Pytorch","task":"multimodal-dialogue"}
generation_config.json ADDED
@@ -0,0 +1,12 @@
{
  "chat_format": "raw",
  "do_sample": true,
  "eos_token_id": 151643,
  "max_new_tokens": 512,
  "max_window_size": 6144,
  "pad_token_id": 151643,
  "top_k": 0,
  "top_p": 0.5,
  "transformers_version": "4.31.0"
}
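These defaults map directly onto transformers' GenerationConfig; the sketch below builds the equivalent object in code. The eos/pad id 151643 is the tokenizer's <|endoftext|> id, and chat_format / max_window_size are Qwen-specific extras that GenerationConfig simply carries along as attributes.

from transformers import GenerationConfig

# Mirrors generation_config.json above.
gen_cfg = GenerationConfig(
    do_sample=True,
    eos_token_id=151643,
    pad_token_id=151643,
    max_new_tokens=512,
    top_k=0,            # top_k=0 disables top-k filtering
    top_p=0.5,
    chat_format="raw",      # Qwen-specific extra key
    max_window_size=6144,   # Qwen-specific extra key
)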
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
{
  "crop_size": 448,
  "do_center_crop": true,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "size": 448
}
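This is a standard CLIP-style preprocessor (resize to 448, center crop to 448x448, normalize with the CLIP mean/std, bicubic resampling). A sketch of loading it with transformers' CLIP image processor, assuming the repo has been snapshotted locally ("/path/to/snapshot" and "example.jpg" are placeholders):

from PIL import Image
from transformers import CLIPImageProcessor

# Reads the preprocessor_config.json above from a local snapshot of this repo.
processor = CLIPImageProcessor.from_pretrained("/path/to/snapshot")

image = Image.open("example.jpg").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 448, 448])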
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f10c8848a532540de4d89e7869c16263bdba0d6960fe6e689aaf129a89eb8ce2
size 22833806212
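The entry above is a Git LFS pointer, not the weights themselves; the real payload is roughly 22.8 GB. A sketch of resolving it with huggingface_hub (the repo id is a placeholder, not taken from this page):

from huggingface_hub import hf_hub_download

# Downloads the actual ~22.8 GB checkpoint referenced by the LFS pointer.
path = hf_hub_download(repo_id="Mizukiluke/<repo-name>", filename="pytorch_model.bin")
print(path)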
qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
tokenization_qwen.py ADDED
@@ -0,0 +1,277 @@
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Tokenization classes for QWen."""

import base64
import logging
import os
import unicodedata
from typing import Collection, Dict, List, Set, Tuple, Union

import tiktoken
from transformers import PreTrainedTokenizer, AddedToken

logger = logging.getLogger(__name__)


VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}

PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
# as the default behavior is changed to allow special tokens in
# regular texts, the surface forms of special tokens need to be
# as different as possible to minimize the impact
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
# changed to use actual index to avoid misconfiguration with vocabulary expansion
SPECIAL_START_ID = 151643
SPECIAL_TOKENS = tuple(
    enumerate(
        (
            (
                ENDOFTEXT,
                IMSTART,
                IMEND,
            )
            + EXTRAS
        ),
        start=SPECIAL_START_ID,
    )
)
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)


def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    with open(tiktoken_bpe_file, "rb") as f:
        contents = f.read()
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }


class QWenTokenizer(PreTrainedTokenizer):
    """QWen tokenizer."""

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        extra_vocab_file=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # how to handle errors in decoding UTF-8 byte sequences
        # use "ignore" if you are doing streaming inference
        self.errors = errors

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
        self.special_tokens = {
            token: index
            for index, token in SPECIAL_TOKENS
        }

        # try to load extra vocab from file
        if extra_vocab_file is not None:
            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
            for token, index in extra_mergeable_ranks.items():
                if token in self.mergeable_ranks:
                    logger.info(f"extra token {token} exists, skipping")
                    continue
                if index in used_ids:
                    logger.info(f"the index {index} for extra token {token} exists, skipping")
                    continue
                self.mergeable_ranks[token] = index
            # the index may be sparse after this, but don't worry, tiktoken.Encoding will handle it

        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"

        self.decoder = {
            v: k for k, v in self.mergeable_ranks.items()
        }  # type: dict[int, bytes|str]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.im_start_id = self.special_tokens[IMSTART]
        self.im_end_id = self.special_tokens[IMEND]
        self.pad_token_id = self.eod_id

    def __getstate__(self):
        # for pickle lovers
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        # tokenizer is not python native; don't pickle it; rebuild it
        self.__dict__.update(state)
        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.tokenizer = enc

    def __len__(self) -> int:
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        return self.mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> List[int]:
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        if not special_tokens and new_tokens:
            raise ValueError("Adding regular tokens is not supported")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS_SET:
                raise ValueError("Adding unknown special tokens is not supported")
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (the tiktoken BPE file).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "qwen.tiktoken")
        with open(file_path, "w", encoding="utf8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string into a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Defaults to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not appear in regular texts and trigger errors.
                Defaults to an empty tuple.

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model-specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        tokens = []
        text = unicodedata.normalize("NFC", text)

        # this implementation takes a detour: text -> token id -> token surface forms
        for t in self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        ):
            tokens.append(self.decoder[t])
        return tokens

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens into a single string.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer. Splits into words for word-based
        vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).

        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
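A quick usage sketch for the class above, instantiated directly against the qwen.tiktoken file from this commit. This assumes the tiktoken package is installed and transformers is pinned near 4.31, the version recorded in config.json; the sample text is illustrative.

from tokenization_qwen import QWenTokenizer

# Direct instantiation against the BPE file uploaded in this commit.
tok = QWenTokenizer(vocab_file="qwen.tiktoken")

ids = tok("mPLUG-Owl2 uses a Qwen tokenizer.")["input_ids"]
print(ids)
print(tok.decode(ids))   # round-trips back to the input text
print(tok.eod_id)        # 151643, the eos/pad id used in generation_config.json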
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
{
  "model_max_length": 8192,
  "tokenizer_class": "QWenTokenizer",
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_qwen.QWenTokenizer",
      null
    ]
  }
}
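Because of the auto_map entry, the tokenizer can also be loaded through the Auto API without importing the module by hand; a sketch, with "/path/to/snapshot" standing in for a local copy of this repo:

from transformers import AutoTokenizer

# auto_map routes AutoTokenizer to tokenization_qwen.QWenTokenizer,
# so only trust_remote_code is needed.
tokenizer = AutoTokenizer.from_pretrained("/path/to/snapshot", trust_remote_code=True)
print(type(tokenizer).__name__)    # QWenTokenizer
print(tokenizer.model_max_length)  # 8192, from tokenizer_config.json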