ZwwWayne committed on
Commit
08fa4ec
2 Parent(s): d252c2d fc9cd70

Merge branch 'main' of https://huggingface.co/internlm/internlm2-chat-1_8b-sft into main

Browse files
special_tokens_map.json CHANGED
@@ -35,4 +35,4 @@
35
  "rstrip": false,
36
  "single_word": false
37
  }
38
- }
 
35
  "rstrip": false,
36
  "single_word": false
37
  }
38
+ }
tokenization_internlm2_fast.py CHANGED
@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
56
  return unk_id
57
 
58
  def decoder(self, replacement, add_prefix_space):
59
- return decoders.Sequence(
60
- [
61
- decoders.Replace("▁", " "),
62
- decoders.ByteFallback(),
63
- decoders.Fuse(),
64
- decoders.Strip(content=" ", left=1),
65
- ]
66
- )
67
 
68
  def tokenizer(self, proto):
69
  model_type = proto.trainer_spec.model_type
 
56
  return unk_id
57
 
58
  def decoder(self, replacement, add_prefix_space):
59
+ decoders_sequence = [
60
+ decoders.Replace("▁", " "),
61
+ decoders.ByteFallback(),
62
+ decoders.Fuse(),
63
+ ]
64
+ if self.proto.normalizer_spec.add_dummy_prefix:
65
+ decoders_sequence.append(decoders.Strip(content=" ", left=1))
66
+ return decoders.Sequence(decoders_sequence)
67
 
68
  def tokenizer(self, proto):
69
  model_type = proto.trainer_spec.model_type
tokenizer_config.json CHANGED
@@ -99,4 +99,4 @@
99
  "sp_model_kwargs": null,
100
  "tokenizer_class": "InternLM2Tokenizer",
101
  "unk_token": "<unk>"
102
- }
 
99
  "sp_model_kwargs": null,
100
  "tokenizer_class": "InternLM2Tokenizer",
101
  "unk_token": "<unk>"
102
+ }