Text Generation · Safetensors · Spanish · Paraguay · Culture · Custom Code · Guaraní · unsloth
enpaiva committed
Commit 22dd552 · 1 Parent(s): d153a99

Upload tokenizer

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer-model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{}
tokenizer-model/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+{
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer-model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05e97791a5e007260de1db7e1692e53150e08cea481e2bf25435553380c147ee
+size 17477929
tokenizer-model/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
+size 4241003
tokenizer-model/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "<start_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<end_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": "<bos>",
+  "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "legacy": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
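
Note: the chat_template above is the standard Gemma turn format (user/model turns wrapped in <start_of_turn>/<end_of_turn>, with the system role rejected). A minimal sketch of applying it, assuming a placeholder repo id and loading only the tokenizer-model subfolder:

from transformers import AutoTokenizer

# Placeholder repo id; load only the generation-side (Gemma) tokenizer.
tok = AutoTokenizer.from_pretrained("your-org/your-repo", subfolder="tokenizer-model")

messages = [{"role": "user", "content": "Mba'éichapa?"}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <bos><start_of_turn>user
# Mba'éichapa?<end_of_turn>
# <start_of_turn>model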
tokenizer-retriever/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer-retriever/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer-retriever/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": false,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
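
Note: this retriever-side tokenizer is a lowercasing BertTokenizer capped at 512 tokens. A minimal sketch of loading it on its own (placeholder repo id):

from transformers import AutoTokenizer

# Placeholder repo id; load only the retriever-side (BERT) tokenizer.
retriever_tok = AutoTokenizer.from_pretrained("your-org/your-repo", subfolder="tokenizer-retriever")

enc = retriever_tok("Ñande reko Paraguay", truncation=True, max_length=512, return_tensors="pt")
# The token sequence begins with [CLS] and ends with [SEP].
print(retriever_tok.convert_ids_to_tokens(enc["input_ids"][0]))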
tokenizer-retriever/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+{
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer_keeper.KeeperTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "KeeperTokenizer"
+}
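
Note: the auto_map entry points AutoTokenizer at the custom KeeperTokenizer defined in tokenizer_keeper.py, so loading the top-level tokenizer requires opting into remote code. A sketch under that assumption (placeholder repo id; the companion configuration_keeper.py module imported by tokenizer_keeper.py is not part of this commit and must also be present):

from transformers import AutoTokenizer

# trust_remote_code=True is needed because tokenizer_class/auto_map point at
# custom code (tokenizer_keeper.KeeperTokenizer) shipped with the repo.
keeper_tok = AutoTokenizer.from_pretrained("your-org/your-repo", trust_remote_code=True)
print(type(keeper_tok).__name__)  # KeeperTokenizer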
tokenizer_keeper.py ADDED
@@ -0,0 +1,131 @@
+import os
+import json
+
+import torch
+from transformers import (
+    AutoTokenizer,
+    PreTrainedTokenizer
+)
+
+from configuration_keeper import KeeperConfig
+
+from typing import Optional, List, Union
+
+
+class KeeperTokenizer(PreTrainedTokenizer):
+
+    config_class = KeeperConfig
+
+    def __init__(self, cfg=None):
+
+
+        self.tokenizer_retriever = None
+        self.tokenizer_model = None
+
+        if cfg:
+            print("Initializing KeeperTokenizer with cfg")
+            # Initialize from the provided configuration
+            self.tokenizer_retriever = AutoTokenizer.from_pretrained(cfg.retriever_config['_name_or_path'])
+            self.tokenizer_model = AutoTokenizer.from_pretrained(cfg.model_config['_name_or_path'])
+
+            # Store kwargs for serialization and future loading
+            self.init_kwargs = {'cfg': cfg}
+
+            super().__init__()  # Initialize the base class first
+            print("Initialization complete")
+        else:
+            # If cfg is not provided, loading is handled in from_pretrained
+            print("Initializing KeeperTokenizer without cfg")
+
+
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        # Create a new KeeperTokenizer instance without cfg
+        instance = cls()
+
+        print("Loading tokenizer_retriever from", pretrained_model_name_or_path)
+        instance.tokenizer_retriever = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, subfolder='tokenizer-retriever'
+        )
+
+        print("Loading tokenizer_model from", pretrained_model_name_or_path)
+        instance.tokenizer_model = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, subfolder='tokenizer-model'
+        )
+
+        return instance
+
+    @property
+    def vocab_size(self):
+        # Get the vocabularies of both tokenizers
+        vocab_retriever = self.tokenizer_retriever.get_vocab()
+        vocab_model = self.tokenizer_model.get_vocab()
+
+        # Merge the vocabularies
+        combined_vocab = {**vocab_retriever, **vocab_model}
+
+        # Return the size of the combined vocabulary
+        return len(combined_vocab)
+
+
+    def get_vocab(self):
+        # Get the vocabularies of both tokenizers
+        vocab_retriever = self.tokenizer_retriever.get_vocab()
+        vocab_model = self.tokenizer_model.get_vocab()
+
+        # Return the vocabularies in a dict under separate keys
+        separated_vocabularies = {
+            'vocab_retriever': vocab_retriever,
+            'vocab_model': vocab_model
+        }
+
+        return separated_vocabularies
+
+    def _tokenize(self, text, **kwargs):
+        # You must implement this method for your tokenization logic
+        pass
+
+    def encode(self, text, **kwargs):
+        tokens_retriever = self.tokenizer_retriever(text, return_tensors='pt', **kwargs)
+        tokens_model = self.tokenizer_model(text, return_tensors='pt', **kwargs)
+
+        return {
+            'tokens_retriever': tokens_retriever,
+            'tokens_model': tokens_model
+        }
+
+    def decode(
+        self,
+        token_ids: Union[int, List[int], "torch.Tensor"],
+        skip_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        return self.tokenizer_model.decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        # Make sure the output directory exists
+        os.makedirs(save_directory, exist_ok=True)
+
+        # Save the retriever tokenizer
+        retriever_save_directory = os.path.join(save_directory, "tokenizer-retriever")
+        os.makedirs(retriever_save_directory, exist_ok=True)
+        self.tokenizer_retriever.save_pretrained(retriever_save_directory)
+
+        # Save the model tokenizer
+        model_save_directory = os.path.join(save_directory, "tokenizer-model")
+        os.makedirs(model_save_directory, exist_ok=True)
+        self.tokenizer_model.save_pretrained(model_save_directory)
+
+        # Return the names of the saved files (optional)
+        saved_files = [
+            "tokenizer-retriever/tokenizer_config.json",
+            "tokenizer-retriever/special_tokens_map.json",
+            "tokenizer-retriever/vocab.json",
+            "tokenizer-retriever/added_tokens.json",
+            "tokenizer-model/tokenizer_config.json",
+            "tokenizer-model/special_tokens_map.json",
+            "tokenizer-model/vocab.json",
+            "tokenizer-model/added_tokens.json"
+        ]
+        return tuple(os.path.join(save_directory, file) for file in saved_files)
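
Usage sketch for the class above (placeholder repo id; assumes configuration_keeper.py is available alongside this file): encode() returns both tokenizations, and decode() delegates to the model-side tokenizer.

from transformers import AutoTokenizer

# Placeholder repo id; trust_remote_code pulls in tokenizer_keeper.KeeperTokenizer.
keeper_tok = AutoTokenizer.from_pretrained("your-org/your-repo", trust_remote_code=True)

# encode() returns a dict with one BatchEncoding per underlying tokenizer.
batch = keeper_tok.encode("Mba'éichapa, Paraguay?")
print(batch["tokens_retriever"]["input_ids"].shape)  # retriever (BERT-side) ids
print(batch["tokens_model"]["input_ids"].shape)      # model (Gemma-side) ids

# decode() goes through the model-side (Gemma) tokenizer only.
print(keeper_tok.decode(batch["tokens_model"]["input_ids"][0], skip_special_tokens=True))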