RangiLyu committed on
Commit
8542548
·
verified ·
1 Parent(s): 38c4c6e

Update special tokens (#2)

Browse files

- Update special tokens (68a6e2f18ff1cd0bb761ca73970f6e401ab58b30)
- add chatml template (72a7eb13d03ddfca99796c2149b9ca93cb0fdb58)
- add bos in chat template (110d79889bc7efeb76103f4b6feb2725a6763d53)
- update chat template in model (e0b670360500a66f2d1ac42e2b1ea24f70aad8d1)

Files changed (2) hide show
  1. modeling_internlm2.py +6 -6
  2. tokenizer_config.json +77 -2
modeling_internlm2.py CHANGED
@@ -1138,12 +1138,12 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1138
  def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
1139
  prompt = ""
1140
  if meta_instruction:
1141
- prompt += f"""<s>[UNUSED_TOKEN_146]system\n{meta_instruction}[UNUSED_TOKEN_145]\n"""
1142
  else:
1143
  prompt += "<s>"
1144
  for record in history:
1145
- prompt += f"""[UNUSED_TOKEN_146]user\n{record[0]}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n{record[1]}[UNUSED_TOKEN_145]\n"""
1146
- prompt += f"""[UNUSED_TOKEN_146]user\n{query}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"""
1147
  return tokenizer([prompt], return_tensors="pt")
1148
 
1149
  @torch.no_grad()
@@ -1165,7 +1165,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1165
  inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
1166
  inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
1167
  # also add end-of-assistant token in eos token id to avoid unnecessary generation
1168
- eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["[UNUSED_TOKEN_145]"])[0]]
1169
  outputs = self.generate(
1170
  **inputs,
1171
  streamer=streamer,
@@ -1178,7 +1178,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1178
  )
1179
  outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
1180
  response = tokenizer.decode(outputs, skip_special_tokens=True)
1181
- response = response.split("[UNUSED_TOKEN_145]")[0]
1182
  history = history + [(query, response)]
1183
  return response, history
1184
 
@@ -1231,7 +1231,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1231
  return
1232
 
1233
  token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
1234
- if token.strip() != "[UNUSED_TOKEN_145]":
1235
  self.response = self.response + token
1236
  history = self.history + [(self.query, self.response)]
1237
  self.queue.put((self.response, history))
 
1138
  def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
1139
  prompt = ""
1140
  if meta_instruction:
1141
+ prompt += f"""<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"""
1142
  else:
1143
  prompt += "<s>"
1144
  for record in history:
1145
+ prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
1146
+ prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
1147
  return tokenizer([prompt], return_tensors="pt")
1148
 
1149
  @torch.no_grad()
 
1165
  inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
1166
  inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
1167
  # also add end-of-assistant token in eos token id to avoid unnecessary generation
1168
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
1169
  outputs = self.generate(
1170
  **inputs,
1171
  streamer=streamer,
 
1178
  )
1179
  outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
1180
  response = tokenizer.decode(outputs, skip_special_tokens=True)
1181
+ response = response.split("<|im_end|>")[0]
1182
  history = history + [(query, response)]
1183
  return response, history
1184
 
 
1231
  return
1232
 
1233
  token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
1234
+ if token.strip() != "<|im_end|>":
1235
  self.response = self.response + token
1236
  history = self.history + [(self.query, self.response)]
1237
  self.queue.put((self.response, history))
tokenizer_config.json CHANGED
@@ -11,5 +11,80 @@
11
  "model_max_length": 1000000000000000019884624838656,
12
  "pad_token": "</s>",
13
  "tokenizer_class": "InternLMTokenizer",
14
- "unk_token": "<unk>"
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  "model_max_length": 1000000000000000019884624838656,
12
  "pad_token": "</s>",
13
  "tokenizer_class": "InternLMTokenizer",
14
+ "unk_token": "<unk>",
15
+ "added_tokens_decoder": {
16
+ "0": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false,
22
+ "special": true
23
+ },
24
+ "1": {
25
+ "content": "<s>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false,
30
+ "special": true
31
+ },
32
+ "2": {
33
+ "content": "</s>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false,
38
+ "special": true
39
+ },
40
+ "92543": {
41
+ "content": "<|im_start|>",
42
+ "lstrip": false,
43
+ "normalized": false,
44
+ "rstrip": false,
45
+ "single_word": false,
46
+ "special": true
47
+ },
48
+ "92542": {
49
+ "content": "<|im_end|>",
50
+ "lstrip": false,
51
+ "normalized": false,
52
+ "rstrip": false,
53
+ "single_word": false,
54
+ "special": true
55
+ },
56
+ "92541": {
57
+ "content": "<|action_start|>",
58
+ "lstrip": false,
59
+ "normalized": false,
60
+ "rstrip": false,
61
+ "single_word": false,
62
+ "special": true
63
+ },
64
+ "92540": {
65
+ "content": "<|action_end|>",
66
+ "lstrip": false,
67
+ "normalized": false,
68
+ "rstrip": false,
69
+ "single_word": false,
70
+ "special": true
71
+ },
72
+ "92539": {
73
+ "content": "<|interpreter|>",
74
+ "lstrip": false,
75
+ "normalized": false,
76
+ "rstrip": false,
77
+ "single_word": false,
78
+ "special": true
79
+ },
80
+ "92538": {
81
+ "content": "<|plugin|>",
82
+ "lstrip": false,
83
+ "normalized": false,
84
+ "rstrip": false,
85
+ "single_word": false,
86
+ "special": true
87
+ }
88
+ },
89
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
90
+ }