rafaelgeraldini committed on
Commit
a921d0f
1 Parent(s): d8d56fe

Upload Tokenizer v3

Browse files
Files changed (4) hide show
  1. README.md +5 -5
  2. special_tokens_map.json +33 -4
  3. tokenizer.json +10 -0
  4. tokenizer_config.json +17 -1
README.md CHANGED
@@ -1,16 +1,16 @@
1
  ---
2
- library_name: transformers
3
- base_model: codellama/CodeLlama-7b-Instruct-hf
4
- license: llama2
5
- datasets:
6
- - semantixai/Test-Dataset-Lloro
7
  language:
8
  - pt
 
 
9
  tags:
10
  - code
11
  - analytics
12
  - analise-dados
13
  - portugues-BR
 
 
 
14
  ---
15
 
16
  **Lloro 7B**
 
1
  ---
 
 
 
 
 
2
  language:
3
  - pt
4
+ license: llama2
5
+ library_name: transformers
6
  tags:
7
  - code
8
  - analytics
9
  - analise-dados
10
  - portugues-BR
11
+ base_model: codellama/CodeLlama-7b-Instruct-hf
12
+ datasets:
13
+ - semantixai/Test-Dataset-Lloro
14
  ---
15
 
16
  **Lloro 7B**
special_tokens_map.json CHANGED
@@ -7,13 +7,42 @@
7
  "▁<SUF>",
8
  "▁<MID>",
9
  "▁<EOT>",
 
 
 
 
 
10
  "▁<PRE>",
11
  "▁<MID>",
12
  "▁<SUF>",
13
  "▁<EOT>"
14
  ],
15
- "bos_token": "<s>",
16
- "eos_token": "</s>",
17
- "pad_token": "<unk>",
18
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  }
 
7
  "▁<SUF>",
8
  "▁<MID>",
9
  "▁<EOT>",
10
+ "[PAD]",
11
+ "▁<PRE>",
12
+ "▁<MID>",
13
+ "▁<SUF>",
14
+ "▁<EOT>",
15
  "▁<PRE>",
16
  "▁<MID>",
17
  "▁<SUF>",
18
  "▁<EOT>"
19
  ],
20
+ "bos_token": {
21
+ "content": "<s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "eos_token": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "pad_token": {
35
+ "content": "[PAD]",
36
+ "lstrip": false,
37
+ "normalized": false,
38
+ "rstrip": false,
39
+ "single_word": false
40
+ },
41
+ "unk_token": {
42
+ "content": "<unk>",
43
+ "lstrip": false,
44
+ "normalized": false,
45
+ "rstrip": false,
46
+ "single_word": false
47
+ }
48
  }
tokenizer.json CHANGED
@@ -65,6 +65,15 @@
65
  "rstrip": true,
66
  "normalized": false,
67
  "special": true
 
 
 
 
 
 
 
 
 
68
  }
69
  ],
70
  "normalizer": {
@@ -170,6 +179,7 @@
170
  "end_of_word_suffix": null,
171
  "fuse_unk": true,
172
  "byte_fallback": true,
 
173
  "vocab": {
174
  "<unk>": 0,
175
  "<s>": 1,
 
65
  "rstrip": true,
66
  "normalized": false,
67
  "special": true
68
+ },
69
+ {
70
+ "id": 32016,
71
+ "content": "[PAD]",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
  }
78
  ],
79
  "normalizer": {
 
179
  "end_of_word_suffix": null,
180
  "fuse_unk": true,
181
  "byte_fallback": true,
182
+ "ignore_merges": false,
183
  "vocab": {
184
  "<unk>": 0,
185
  "<s>": 1,
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
 
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<unk>",
@@ -55,6 +57,14 @@
55
  "rstrip": true,
56
  "single_word": false,
57
  "special": true
 
 
 
 
 
 
 
 
58
  }
59
  },
60
  "additional_special_tokens": [
@@ -65,12 +75,18 @@
65
  "▁<SUF>",
66
  "▁<MID>",
67
  "▁<EOT>",
 
 
 
 
 
68
  "▁<PRE>",
69
  "▁<MID>",
70
  "▁<SUF>",
71
  "▁<EOT>"
72
  ],
73
  "bos_token": "<s>",
 
74
  "clean_up_tokenization_spaces": false,
75
  "eos_token": "</s>",
76
  "eot_token": "▁<EOT>",
@@ -78,7 +94,7 @@
78
  "legacy": null,
79
  "middle_token": "▁<MID>",
80
  "model_max_length": 1000000000000000019884624838656,
81
- "pad_token": "<unk>",
82
  "prefix_token": "▁<PRE>",
83
  "sp_model_kwargs": {},
84
  "suffix_token": "▁<SUF>",
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
 
57
  "rstrip": true,
58
  "single_word": false,
59
  "special": true
60
+ },
61
+ "32016": {
62
+ "content": "[PAD]",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
  }
69
  },
70
  "additional_special_tokens": [
 
75
  "▁<SUF>",
76
  "▁<MID>",
77
  "▁<EOT>",
78
+ "[PAD]",
79
+ "▁<PRE>",
80
+ "▁<MID>",
81
+ "▁<SUF>",
82
+ "▁<EOT>",
83
  "▁<PRE>",
84
  "▁<MID>",
85
  "▁<SUF>",
86
  "▁<EOT>"
87
  ],
88
  "bos_token": "<s>",
89
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
90
  "clean_up_tokenization_spaces": false,
91
  "eos_token": "</s>",
92
  "eot_token": "▁<EOT>",
 
94
  "legacy": null,
95
  "middle_token": "▁<MID>",
96
  "model_max_length": 1000000000000000019884624838656,
97
+ "pad_token": "[PAD]",
98
  "prefix_token": "▁<PRE>",
99
  "sp_model_kwargs": {},
100
  "suffix_token": "▁<SUF>",