Text Generation
Safetensors
Spanish
Paraguay
Culture
Custom Code
Guaraní
unsloth
enpaiva committed
Commit 714d682
1 Parent(s): 22dd552

Upload model

config.json ADDED
@@ -0,0 +1,69 @@
+ {
+   "architectures": [
+     "KeeperModelForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_keeper.KeeperConfig",
+     "AutoModel": "tokenizer_keeper.KeeperTokenizer",
+     "AutoModelForCausalLM": "model_keeper.KeeperModelForCausalLM"
+   },
+   "device_map": "auto",
+   "model_config": {
+     "_name_or_path": "google/gemma-2b-it",
+     "architectures": [
+       "GemmaForCausalLM"
+     ],
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "bos_token_id": 2,
+     "eos_token_id": 1,
+     "head_dim": 256,
+     "hidden_act": "gelu",
+     "hidden_size": 2048,
+     "initializer_range": 0.02,
+     "intermediate_size": 16384,
+     "max_position_embeddings": 8192,
+     "model_type": "gemma",
+     "num_attention_heads": 8,
+     "num_hidden_layers": 18,
+     "num_key_value_heads": 1,
+     "pad_token_id": 0,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": null,
+     "rope_theta": 10000.0,
+     "torch_dtype": "bfloat16",
+     "transformers_version": "4.38.0.dev0",
+     "use_cache": true,
+     "vocab_size": 256000
+   },
+   "model_type": "keeper",
+   "retriever_config": {
+     "_name_or_path": "AdrienB134/ColBERTv1.0-bert-based-spanish-mmarcoES",
+     "architectures": [
+       "HF_ColBERT"
+     ],
+     "attention_probs_dropout_prob": 0.1,
+     "classifier_dropout": null,
+     "gradient_checkpointing": false,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 768,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "layer_norm_eps": 1e-12,
+     "max_position_embeddings": 512,
+     "model_type": "bert",
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12,
+     "output_past": true,
+     "pad_token_id": 1,
+     "position_embedding_type": "absolute",
+     "torch_dtype": "float32",
+     "transformers_version": "4.35.2",
+     "type_vocab_size": 2,
+     "use_cache": true,
+     "vocab_size": 31002
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.39.2"
+ }
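
Note: because "auto_map" routes the Auto classes to custom code shipped with this repo (configuration_keeper.py, tokenizer_keeper.py, model_keeper.py), downstream loading requires trust_remote_code=True. A minimal loading sketch; the repo id below is a placeholder, not confirmed by this commit:

  from transformers import AutoConfig, AutoModelForCausalLM

  repo_id = "enpaiva/keeper-gemma-2b"  # placeholder repo id
  config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)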
configuration_keeper.py ADDED
@@ -0,0 +1,81 @@
+ from transformers import PretrainedConfig, AutoConfig
+ from typing import List
+
+
+ class KeeperConfig(PretrainedConfig):
+     model_type = "keeper"
+
+     def __init__(
+         self,
+         retriever_config = {
+             "_name_or_path": "AdrienB134/ColBERTv1.0-bert-based-spanish-mmarcoES",
+             "architectures": [
+                 "HF_ColBERT"
+             ],
+             "attention_probs_dropout_prob": 0.1,
+             "classifier_dropout": None,
+             "gradient_checkpointing": False,
+             "hidden_act": "gelu",
+             "hidden_dropout_prob": 0.1,
+             "hidden_size": 768,
+             "initializer_range": 0.02,
+             "intermediate_size": 3072,
+             "layer_norm_eps": 1e-12,
+             "max_position_embeddings": 512,
+             "model_type": "bert",
+             "num_attention_heads": 12,
+             "num_hidden_layers": 12,
+             "output_past": True,
+             "pad_token_id": 1,
+             "position_embedding_type": "absolute",
+             "torch_dtype": "float32",
+             "transformers_version": "4.35.2",
+             "type_vocab_size": 2,
+             "use_cache": True,
+             "vocab_size": 31002
+         },
+         model_config = {
+             "_name_or_path": "google/gemma-2b-it",
+             "architectures": [
+                 "GemmaForCausalLM"
+             ],
+             "attention_bias": False,
+             "attention_dropout": 0.0,
+             "bos_token_id": 2,
+             "eos_token_id": 1,
+             "head_dim": 256,
+             "hidden_act": "gelu",
+             "hidden_size": 2048,
+             "initializer_range": 0.02,
+             "intermediate_size": 16384,
+             "max_position_embeddings": 8192,
+             "model_type": "gemma",
+             "num_attention_heads": 8,
+             "num_hidden_layers": 18,
+             "num_key_value_heads": 1,
+             "pad_token_id": 0,
+             "rms_norm_eps": 1e-06,
+             "rope_scaling": None,
+             "rope_theta": 10000.0,
+             "torch_dtype": "bfloat16",
+             "transformers_version": "4.38.0.dev0",
+             "use_cache": True,
+             "vocab_size": 256000
+         },
+         auto_map = {
+             "AutoConfig": "configuration_keeper.KeeperConfig",
+             "AutoModel": "tokenizer_keeper.KeeperTokenizer",
+             "AutoModelForCausalLM": "model_keeper.KeeperModelForCausalLM",
+         },
+         **kwargs,
+     ):
+         self.retriever_config = retriever_config
+         self.model_config = model_config
+         self.device_map = 'auto'
+         self.auto_map = auto_map
+
+         super().__init__(**kwargs)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+         # Build a KeeperConfig from the loaded base configuration
+         return cls(model_config=model_config.to_dict(), **kwargs)
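
Since KeeperConfig subclasses PretrainedConfig, instantiating it with the defaults above and serializing it should reproduce the config.json in this commit. A quick sketch; the output directory is arbitrary:

  from configuration_keeper import KeeperConfig

  cfg = KeeperConfig()
  cfg.save_pretrained("./keeper-checkpoint")  # writes a config.json with the defaults above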
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.39.2"
+ }
model_keeper.py ADDED
@@ -0,0 +1,191 @@
+ from configuration_keeper import KeeperConfig
+
+ import torch
+ import numpy as np
+ from einops import rearrange
+ from typing import Dict
+ from transformers import (
+     AutoTokenizer,
+     AutoConfig,
+     AutoModel,
+     PreTrainedModel,
+     PretrainedConfig,
+     AutoModelForCausalLM,
+     BitsAndBytesConfig
+ )
+
+
+ class KeeperModelForCausalLM(PreTrainedModel):
+     """
+     ColBERT model from: https://arxiv.org/pdf/2004.12832.pdf
+     We use a dot-product instead of cosine similarity per term (slightly better).
+     """
+     config_class = KeeperConfig
+     base_model_prefix = "keeper_model"
+
+     def __init__(self, cfg, n_cands=8, update_both=False) -> None:
+         super().__init__(cfg)
+
+         self.bert = None
+         self.llm = None
+
+         if cfg:
+             print("Initializing KeeperModelForCausalLM from cfg")
+             # Initialize from the given configuration
+             self.bert = AutoModel.from_pretrained(cfg.retriever_config['_name_or_path'])
+
+             bnb_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_quant_type="nf4",
+                 bnb_4bit_compute_dtype=torch.bfloat16
+             )
+
+             self.llm = AutoModelForCausalLM.from_pretrained(
+                 cfg.model_config['_name_or_path'],
+                 device_map=cfg.device_map,
+                 torch_dtype=torch.bfloat16,
+                 quantization_config=bnb_config
+             )
+
+             # Store kwargs for future serialization and loading
+             # self.init_kwargs = {'cfg': cfg}
+
+             print("Initialization complete")
+         else:
+             # If cfg is not provided, loading is handled in from_pretrained
+             print("Initializing KeeperModelForCausalLM without cfg")
+
+         self.n_cands = n_cands
+         self.update_both = update_both
+         print(f"Model n_cands: {self.n_cands}")
+
+         # Initialize empty buffers for the cached document and prompt tensors
+         self.register_buffer('document_retriever_text', torch.empty(0, dtype=torch.long))
+         self.register_buffer('document_retriever_mask', torch.empty(0, dtype=torch.long))
+         self.register_buffer('document_retriever_type', torch.empty(0, dtype=torch.long))
+         self.register_buffer('document_model_text', torch.empty(0, dtype=torch.long))
+         # self.register_buffer('document_model_mask', torch.empty(0, dtype=torch.long))
+         # self.register_buffer('document_model_type', torch.empty(0, dtype=torch.long))
+         self.register_buffer('prompt_left', torch.empty(0, dtype=torch.long))
+         self.register_buffer('prompt_right', torch.empty(0, dtype=torch.long))
+         self.register_buffer('respuesta', torch.empty(0, dtype=torch.long))
+
+     def generate(self, query: Dict[str, torch.LongTensor], k: int = 3, **kwargs):
+         # Move the query tensors to the GPU
+         query_retriever = {key: v.to("cuda") for key, v in query['tokens_retriever'].items()}
+         query_model = {key: v.to("cuda") for key, v in query['tokens_model'].items()}
+
+         # Encode the query and the cached documents with the retriever
+         query_vecs = self.forward_representation(query_retriever)
+         doc_dic = {'input_ids': self.document_retriever_text,
+                    'attention_mask': self.document_retriever_mask,
+                    'token_type_ids': self.document_retriever_type}
+         document_vecs = self.forward_representation(doc_dic, sequence_type="doc")
+
+         self.score = self.forward_aggregation(query_vecs, query['tokens_model']["attention_mask"],
+                                               document_vecs, self.document_retriever_mask)
+
+         # Keep the top-k documents and splice them into the prompt
+         k = min(k, self.score.numel())
+         topk_scores, topk_indices = torch.topk(self.score, k)
+         topk_texts = [self.document_model_text[i] for i in topk_indices[0].tolist()]
+         concatenated_texts = torch.cat(topk_texts, dim=0)
+
+         T = torch.cat((self.prompt_left, concatenated_texts.unsqueeze(0), self.prompt_right,
+                        query_model['input_ids'], self.respuesta), dim=1)
+         prompt_length = T.shape[1]
+
+         outputs = self.llm.generate(input_ids=T, max_new_tokens=256, repetition_penalty=1.15)
+
+         # Return only the newly generated tokens
+         return outputs[0][prompt_length:].unsqueeze(0)
+
+     def forward_representation(self,
+                                tokens,
+                                max_seq_len=128,
+                                sequence_type=None) -> torch.Tensor:
+         if sequence_type == "doc":
+             if self.update_both:
+                 with torch.no_grad():
+                     vecs = self.bert(**tokens)[0]
+             else:
+                 with torch.no_grad():
+                     vecs = self.bert(**tokens)[0]  # assuming a distilbert model here
+         else:
+             with torch.no_grad():
+                 vecs = self.bert(**tokens)[0]
+         # vecs = self.compressor(vecs)
+         return vecs
+
+     def forward_aggregation(self, query_vecs, query_mask, document_vecs, document_mask):
+         # query_vecs: B x N x D
+         # document_vecs: (B * k) x N x D
+
+         # Duplicate each query vector once per candidate document
+         _bsz = query_vecs.shape[0]
+         n_cands = document_vecs.shape[0] // _bsz
+         query_vecs_dup = query_vecs.repeat_interleave(n_cands, dim=0).contiguous()
+
+         # Token-level dot products, with padded document positions masked out
+         score = torch.bmm(query_vecs_dup, document_vecs.transpose(1, 2))
+         exp_mask = document_mask.bool().unsqueeze(1).expand(-1, score.shape[1], -1)
+         score[~exp_mask] = -10000
+
+         # Max pooling over the document dimension, then sum over query tokens
+         score = score.max(-1).values
+         query_mask_dup = query_mask.repeat_interleave(n_cands, dim=0).contiguous()
+         score[~(query_mask_dup.bool())] = 0
+         score = rearrange(score.sum(-1), '(b n) -> b n', n=n_cands)  # B x k
+         return score
+
+     def prompt(self, left_p=None, right_p=None):
+         if left_p is None:
+             left_p = """ <bos><start_of_turn>user
+ Eres un experto en cultura paraguaya que responde segun el contexto:
+ -------------------------------
+ """
+         if right_p is None:
+             right_p = """
+ -------------------------------
+ - Debes responder solamente en Espanol
+ - No utilices conocimientos previos.
+ - Responde de forma clara, amable y concisa.
+
+ Pregunta: """
+         return left_p, right_p
+
+     def save_docs(self, docs: list, tokenizer, max_seq_len=128):
+         # Tokenize the prompt halves
+         prompt_left, prompt_right = self.prompt()
+         prompt_left_output = tokenizer.encode(prompt_left)
+         prompt_right_output = tokenizer.encode(prompt_right)
+
+         # Tokenize the documents
+         doc_outputs = tokenizer.encode(docs, max_length=max_seq_len, padding='max_length', truncation=True)
+
+         # Move the tensors to the GPU (## optimize: tensors that will never be used are kept on the GPU)
+         doc_outputs = {key: v.to("cuda") for key, v in doc_outputs.items()}
+         prompt_left_output = {key: v.to("cuda") for key, v in prompt_left_output.items()}
+         prompt_right_output = {key: v.to("cuda") for key, v in prompt_right_output.items()}
+
+         # Tokenize the answer cue
+         resp = tokenizer.encode("""
+ Respuesta: <end_of_turn>
+ <start_of_turn>model """)
+         resp_model = {key: v.to("cuda") for key, v in resp['tokens_model'].items()}
+
+         # Update the buffers with the document tensors
+         self.document_retriever_text = doc_outputs['tokens_retriever']['input_ids']
+         self.document_retriever_mask = doc_outputs['tokens_retriever']['attention_mask']
+         self.document_retriever_type = doc_outputs['tokens_retriever']['token_type_ids']
+         self.document_model_text = doc_outputs['tokens_model']['input_ids']
+         # self.document_model_mask = key_outputs['tokens_model']['attention_mask']
+         # self.document_model_type = key_outputs['tokens_model']['token_type_ids']
+         self.prompt_left = prompt_left_output['tokens_model']['input_ids']
+         self.prompt_right = prompt_right_output['tokens_model']['input_ids']
+         self.respuesta = resp_model['input_ids']
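
An end-to-end usage sketch inferred from the methods above: save_docs caches tokenized documents and prompt pieces in buffers, and generate scores the cached documents against the query (ColBERT-style max-sim aggregation), prepends the top-k documents to the Gemma chat prompt, and returns only the newly generated tokens. This assumes the companion KeeperTokenizer (mapped to AutoModel in auto_map) returns a dict with "tokens_retriever" and "tokens_model" entries, as generate and save_docs expect; the repo id and document strings are placeholders:

  from transformers import AutoModel, AutoModelForCausalLM

  repo_id = "enpaiva/keeper-gemma-2b"  # placeholder repo id
  tokenizer = AutoModel.from_pretrained(repo_id, trust_remote_code=True)  # KeeperTokenizer via auto_map
  model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

  docs = ["El tereré es una infusión fría típica de Paraguay.",
          "La sopa paraguaya es una torta salada de maíz y queso."]
  model.save_docs(docs, tokenizer)  # cache document tensors on the GPU

  query = tokenizer.encode("¿Qué es el tereré?")
  output_ids = model.generate(query, k=2)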
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d501ad81b259efb42f63a500cae47596b99f0ddf1e57e98113e5393590bfa73
+ size 2608761466