Upload folder using huggingface_hub
- config.json +3 -2
- modeling_internlm2.py +182 -0
- modeling_internvl_chat.py +2 -3
- tokenization_internlm2_fast.py +211 -0
config.json
CHANGED
@@ -11,6 +11,7 @@
   "downsample_ratio": 0.5,
   "dynamic_image_size": true,
   "force_image_size": 448,
+  "image_fold": null,
   "llm_config": {
     "_name_or_path": "pretrained/internlm2-chat-20b/",
     "add_cross_attention": false,
@@ -100,7 +101,7 @@
     "use_cache": false,
     "vocab_size": 92553
   },
-  "max_dynamic_patch":
+  "max_dynamic_patch": 6,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
   "pad2square": false,
@@ -125,7 +126,7 @@
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "
+    "_name_or_path": "work_dirs/internvl_chat_internlm2_20b_448_dynamic_chinese_pretrain/checkpoint-5200-vit",
     "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
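Note on the change: `max_dynamic_patch: 6` caps how many tiles of `force_image_size` (448 px) the dynamic preprocessing may cut an input image into, with `min_dynamic_patch: 1` as the floor and `use_thumbnail: true` adding one extra global view. The sketch below only illustrates how these fields interact; `pick_grid` and its aspect-ratio heuristic are assumptions for exposition, not the repository's actual preprocessing code.

# Illustrative sketch (not the repo's algorithm): how min/max_dynamic_patch and
# use_thumbnail bound the number of 448x448 tiles fed to the vision tower.
def pick_grid(width, height, min_patch=1, max_patch=6, use_thumbnail=True):
    # enumerate (cols, rows) grids whose tile count lies in [min_patch, max_patch]
    candidates = [
        (c, r)
        for n in range(min_patch, max_patch + 1)
        for c in range(1, n + 1)
        for r in range(1, n + 1)
        if c * r == n
    ]
    # pick the grid whose aspect ratio best matches the input image
    aspect = width / height
    cols, rows = min(candidates, key=lambda cr: abs(cr[0] / cr[1] - aspect))
    n_tiles = cols * rows
    if use_thumbnail and n_tiles > 1:
        n_tiles += 1  # one extra downscaled view of the whole image
    return cols, rows, n_tiles

print(pick_grid(1344, 896))  # -> (3, 2, 7): a 3x2 grid plus the thumbnail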
modeling_internlm2.py
CHANGED
@@ -39,6 +39,20 @@ try:
     from transformers.generation.streamers import BaseStreamer
 except:  # noqa # pylint: disable=bare-except
     BaseStreamer = None
+from typing import Any, List, Optional, Tuple, Union
+import torch.distributed as dist
+import torch.utils.checkpoint
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.streamers import BaseStreamer
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+from transformers.generation.utils import GreedySearchOutput, validate_stopping_criteria, GreedySearchDecoderOnlyOutput, GreedySearchEncoderDecoderOutput
 
 from .configuration_internlm2 import InternLM2Config
 
@@ -1272,6 +1286,174 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
 
         return consumer()
 
+    def greedy_search(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        streamer: Optional["BaseStreamer"] = None,
+        **model_kwargs,
+    ) -> Union[GreedySearchOutput, torch.LongTensor]:
+        # init values
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        if max_length is not None:
+            warnings.warn(
+                "`max_length` is deprecated in this function, use"
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
+                UserWarning,
+            )
+            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+
+        this_peer_finished = False  # used by synced_gpus only
+        while True:
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+
+            if synced_gpus and this_peer_finished:
+                continue  # don't waste resources running the code we don't need
+
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # pre-process distribution
+            next_tokens_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_tokens_scores,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # argmax
+            next_tokens = torch.argmax(next_tokens_scores, dim=-1).to(device=input_ids.device)
+            # finished sentences should have their next token be a padding token
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+
+            # if eos_token was found in one sentence, set sentence to finished
+            if eos_token_id_tensor is not None:
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
+                    this_peer_finished = True
+
+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GreedySearchEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GreedySearchDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
 
 # Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
 @add_start_docstrings(
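The hunk above pins a copy of the `greedy_search` loop from contemporaneous transformers releases onto `InternLM2ForCausalLM` (with an added `.to(input_ids.device)` on the sampled tokens), together with the imports it needs, so greedy decoding keeps working even if the installed transformers version relocates these symbols. A minimal usage sketch; the repo id is a placeholder, and `do_sample=False, num_beams=1` is what routes `generate()` through greedy search:

# Usage sketch (assumptions: placeholder repo id; weights available locally or on the Hub).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = 'OpenGVLab/...'  # placeholder: any repo shipping this modeling file
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, trust_remote_code=True
).eval()

inputs = tokenizer('Hello', return_tensors='pt')
with torch.no_grad():
    # do_sample=False + num_beams=1 dispatches generate() to greedy search
    out = model.generate(**inputs, max_new_tokens=32, do_sample=False, num_beams=1)
print(tokenizer.decode(out[0], skip_special_tokens=True))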
modeling_internvl_chat.py
CHANGED
@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
-    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer'
+    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer']
 
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
@@ -337,7 +337,6 @@ class InternVLChatModel(PreTrainedModel):
                 vit_embeds = visual_features
             else:
                 vit_embeds = self.extract_feature(pixel_values)
-
             input_embeds = self.language_model.get_input_embeddings()(input_ids)
             B, N, C = input_embeds.shape
             input_embeds = input_embeds.reshape(B * N, C)
@@ -345,7 +344,7 @@ class InternVLChatModel(PreTrainedModel):
             input_ids = input_ids.reshape(B * N)
             selected = (input_ids == self.img_context_token_id)
             assert selected.sum() != 0
-            input_embeds[selected] = vit_embeds.reshape(-1, C)
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
 
             input_embeds = input_embeds.reshape(B, N, C)
         else:
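Two substantive fixes in this file: `_no_split_modules` becomes a well-formed list (the removed line appears truncated), which matters when sharding the model across GPUs with `device_map='auto'`, and the ViT embeddings are moved onto the device of `input_embeds` before the indexed assignment. Without the `.to(...)`, that assignment raises a device-mismatch `RuntimeError` whenever the vision tower's output and the language model's embedding table sit on different GPUs. A minimal CPU-safe sketch of the pattern (shapes and the mask are illustrative):

# Sketch of the scatter the fix hardens: writing vision embeddings into the
# positions of image-context tokens. On a single device this runs as-is; across
# devices, only the .to(input_embeds.device) form avoids the RuntimeError.
import torch

input_embeds = torch.zeros(4, 8)                     # token embeddings, shape (B*N, C)
vit_embeds = torch.randn(2, 8)                       # vision features, possibly on another GPU
selected = torch.tensor([False, True, True, False])  # image-context token positions

input_embeds[selected] = vit_embeds.reshape(-1, 8).to(input_embeds.device)
print(input_embeds.abs().sum(dim=1))  # rows 1 and 2 are now non-zero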
tokenization_internlm2_fast.py
ADDED
@@ -0,0 +1,211 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization Fast class for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple
+
+from tokenizers import Tokenizer, decoders, normalizers, processors
+from tokenizers.models import BPE
+from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
+                                                 SentencePieceExtractor,
+                                                 SpmConverter)
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+
+from .tokenization_internlm2 import InternLM2Tokenizer
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
+
+
+# Modified from transformers.convert_slow_tokenizer.LlamaConverter
+class InternLM2Converter(SpmConverter):
+    handle_byte_fallback = True
+
+    def vocab(self, proto):
+        vocab = [
+            ('<unk>', 0.0),
+            ('<s>', 0.0),
+            ('</s>', 0.0),
+        ]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
+        return vocab
+
+    def unk_id(self, proto):
+        unk_id = 0
+        return unk_id
+
+    def decoder(self, replacement, add_prefix_space):
+        return decoders.Sequence(
+            [
+                decoders.Replace('▁', ' '),
+                decoders.ByteFallback(),
+                decoders.Fuse(),
+                decoders.Strip(content=' ', left=1),
+            ]
+        )
+
+    def tokenizer(self, proto):
+        model_type = proto.trainer_spec.model_type
+        vocab_scores = self.vocab(proto)
+        # special tokens
+        added_tokens = self.original_tokenizer.added_tokens_decoder
+        for i in range(len(vocab_scores)):
+            piece, score = vocab_scores[i]
+            if i in added_tokens:
+                vocab_scores[i] = (added_tokens[i].content, score)
+        if model_type == 1:
+            raise RuntimeError('InternLM2 is supposed to be a BPE model!')
+
+        elif model_type == 2:
+            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
+            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
+            tokenizer = Tokenizer(
+                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
+            )
+            tokenizer.add_special_tokens(
+                [added_token for index, added_token in added_tokens.items()]
+            )
+        else:
+            raise Exception(
+                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
+            )
+
+        return tokenizer
+
+    def normalizer(self, proto):
+        normalizers_list = []
+        if proto.normalizer_spec.add_dummy_prefix:
+            normalizers_list.append(normalizers.Prepend(prepend='▁'))
+        normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
+        return normalizers.Sequence(normalizers_list)
+
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        return None
+
+
+SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
+
+
+# Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = InternLM2Tokenizer
+    padding_side = 'left'
+    model_input_names = ['input_ids', 'attention_mask']
+    _auto_class = 'AutoTokenizer'
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token='<unk>',
+        bos_token='<s>',
+        eos_token='</s>',
+        pad_token='</s>',
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            sp_model_kwargs=sp_model_kwargs,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            decode_with_prefix_space=decode_with_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()
+        self.vocab_file = vocab_file
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError('add_bos_token = True but bos_token = None')
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError('add_eos_token = True but eos_token = None')
+
+        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
+
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    @property
+    def add_eos_token(self):
+        return self._add_eos_token
+
+    @property
+    def add_bos_token(self):
+        return self._add_bos_token
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+                'tokenizer.'
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
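The added file provides a fast (Rust-backed) tokenizer: `InternLM2Converter` is registered in `SLOW_TO_FAST_CONVERTERS` so the SentencePiece BPE model can be converted for the `tokenizers` library, and `InternLM2TokenizerFast` defaults to left padding for batched generation. A hedged usage sketch; the repo id is a placeholder, and it assumes `tokenizer_config.json` maps `AutoTokenizer` to this class:

# Usage sketch (assumption: placeholder repo id with this file registered
# under auto_map in tokenizer_config.json).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('OpenGVLab/...', trust_remote_code=True, use_fast=True)
enc = tok('InternLM2 says hi', return_tensors='pt')
print(tok.batch_decode(enc['input_ids']))
print(tok.padding_side)  # 'left', set at the class level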