Dejiao Z committed
Commit e026bc9 (1 parent: 67cadad)

updated readme

.ipynb_checkpoints/config-checkpoint.json DELETED
@@ -1,25 +0,0 @@
- {
-   "_name_or_path": "codesage/codesage-small-v2",
-   "architectures": [
-     "CodeSage"
-   ],
-   "auto_map": {
-     "AutoConfig": "config_codesage.CodeSageConfig",
-     "AutoTokenizer": "tokenization_codesage.CodeSageTokenizer",
-     "AutoModel": "modeling_codesage.CodeSageModel",
-     "AutoModelForMaskedLM": "modeling_codesage.CodeSageForMaskedLM",
-     "AutoModelForSequenceClassification": "modeling_codesage.CodeSageForSequenceClassification"
-   },
-   "activation_function": "gelu_new",
-   "attention_dropout_prob": 0.1,
-   "embedding_dropout_prob": 0.1,
-   "initializer_range": 0.02,
-   "layer_norm_epsilon": 1e-05,
-   "hidden_size": 1024,
-   "num_attention_heads": 8,
-   "num_hidden_layers": 6,
-   "intermediate_size": 4096,
-   "max_position_embeddings": 2048,
-   "residual_dropout_prob": 0.1,
-   "vocab_size": 49154
- }
.ipynb_checkpoints/config_codesage-checkpoint.py DELETED
@@ -1,52 +0,0 @@
- #!/usr/bin/env python
- # coding=utf-8
- # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-
- from transformers.configuration_utils import PretrainedConfig
-
- CODESAGE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-     "codesage/codesage-small-v2": "https://huggingface.co/codesage/codesage-small-v2/resolve/main/config.json",
-     "codesage/codesage-base-v2": "https://huggingface.co/codesage/codesage-base-v2/resolve/main/config.json",
-     "codesage/codesage-large-v2": "https://huggingface.co/codesage/codesage-large-v2/resolve/main/config.json",
- }
-
-
- class CodeSageConfig(PretrainedConfig):
-     model_type = "codesage"
-
-     def __init__(
-         self,
-         vocab_size=50257,
-         max_position_embeddings=1024,
-         hidden_size=1024,
-         num_hidden_layers=24,
-         num_attention_heads=8,
-         intermediate_size=4096,
-         activation_function="gelu_new",
-         residual_dropout_prob=0.1,
-         embedding_dropout_prob=0.1,
-         attention_dropout_prob=0.1,
-         layer_norm_epsilon=1e-5,
-         initializer_range=0.02,
-         position_embedding_type='absolute',
-         bos_token_id=0,
-         eos_token_id=0,
-         pad_token_id=49153,
-         **kwargs
-     ):
-         self.vocab_size = vocab_size
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.intermediate_size = intermediate_size
-         assert 'gelu' in activation_function
-         self.activation_function = activation_function
-         self.residual_dropout_prob = residual_dropout_prob
-         self.embedding_dropout_prob = embedding_dropout_prob
-         self.attention_dropout_prob = attention_dropout_prob
-         self.layer_norm_epsilon = layer_norm_epsilon
-         self.initializer_range = initializer_range
-         self.position_embedding_type = position_embedding_type
-
-         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
.ipynb_checkpoints/modeling_codesage-checkpoint.py DELETED
@@ -1,426 +0,0 @@
- #!/usr/bin/env python
- # coding=utf-8
- # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-
- import math
- import torch
- import torch.utils.checkpoint
- from torch import nn
- from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
- from transformers.activations import ACT2FN
- from transformers.modeling_utils import Conv1D, PreTrainedModel
- from transformers.utils import logging
- from .config_codesage import CodeSageConfig
- from transformers.modeling_outputs import (
-     BaseModelOutputWithPooling,
-     MaskedLMOutput,
-     SequenceClassifierOutput
- )
-
- logger = logging.get_logger(__name__)
-
- CODESAGE_PRETRAINED_MODEL_ARCHIVE_LIST = [
-     "codesage/codesage-small-v2",
-     "codesage/codesage-base-v2",
-     "codesage/codesage-large-v2",
-     # See all CodeSage models at https://huggingface.co/models?filter=codesage
- ]
-
-
- class CodeSageAttention(nn.Module):
-     def __init__(self, config):
-         super().__init__()
-
-         self.hidden_size = config.hidden_size
-         self.num_heads = config.num_attention_heads
-         self.head_dim = config.hidden_size // self.num_heads
-         if self.head_dim * self.num_heads != config.hidden_size:
-             raise ValueError(
-                 f"`hidden_size` must be divisible by num_heads "
-                 f"(got `hidden_size`: {config.hidden_size} and `num_heads`: {self.num_heads})."
-             )
-
-         self.c_attn = Conv1D(3 * self.hidden_size, self.hidden_size)
-         self.c_proj = Conv1D(self.hidden_size, self.hidden_size)
-
-         self.attention_dropout = nn.Dropout(config.attention_dropout_prob)
-         self.residual_dropout = nn.Dropout(config.residual_dropout_prob)
-
-     def attn(self, query, key, value, attention_mask=None, head_mask=None):
-         attn_weights = torch.matmul(query, key.transpose(-1, -2))
-         attn_weights = attn_weights / math.sqrt(self.head_dim)
-         if attention_mask is not None:
-             attn_weights = attn_weights + attention_mask
-
-         attn_weights = nn.Softmax(dim=-1)(attn_weights)
-         attn_weights = self.attention_dropout(attn_weights)
-         if head_mask is not None:
-             attn_weights = attn_weights * head_mask
-
-         attn_output = torch.matmul(attn_weights, value)
-         return attn_output, attn_weights
-
-     def split_heads(self, tensor, num_heads, attn_head_size):
-         """
-         Splits hidden_size dim into attn_head_size and num_heads
-         """
-         new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
-         tensor = tensor.view(*new_shape)
-         return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
-
-     def merge_heads(self, tensor, num_heads, attn_head_size):
-         """
-         Merges attn_head_size dim and num_attn_heads dim into hidden_size
-         """
-         tensor = tensor.permute(0, 2, 1, 3).contiguous()
-         new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
-         return tensor.view(new_shape)
-
-     def forward(
-         self,
-         hidden_states,
-         attention_mask=None,
-         head_mask=None,
-         output_attentions=False,
-     ):
-         query, key, value = self.c_attn(hidden_states).split(self.hidden_size, dim=2)
-         query = self.split_heads(query, self.num_heads, self.head_dim)
-         key = self.split_heads(key, self.num_heads, self.head_dim)
-         value = self.split_heads(value, self.num_heads, self.head_dim)
-
-         attn_output, attn_weights = self.attn(query, key, value, attention_mask, head_mask)
-
-         attn_output = self.merge_heads(attn_output, self.num_heads, self.head_dim)
-         attn_output = self.c_proj(attn_output)
-         attn_output = self.residual_dropout(attn_output)
-
-         outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
-         return outputs  # a, present, (attentions)
-
-
- class CodeSageMLP(nn.Module):
-     def __init__(self, intermediate_size, config):
-         super().__init__()
-
-         self.c_fc = Conv1D(intermediate_size, config.hidden_size)
-         self.act = ACT2FN[config.activation_function]
-         self.c_proj = Conv1D(config.hidden_size, intermediate_size)
-         self.dropout = nn.Dropout(config.residual_dropout_prob)
-
-     def forward(self, hidden_states):
-         hidden_states = self.c_fc(hidden_states)
-         hidden_states = self.act(hidden_states)
-         hidden_states = self.c_proj(hidden_states)
-         hidden_states = self.dropout(hidden_states)
-         return hidden_states
-
-
- class CodeSageBlock(nn.Module):
-     def __init__(self, config):
-         super().__init__()
-         hidden_size = config.hidden_size
-         inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size
-         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-         self.attn = CodeSageAttention(config)
-         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-         self.mlp = CodeSageMLP(inner_dim, config)
-
-     def forward(
-         self,
-         hidden_states,
-         attention_mask=None,
-         head_mask=None,
-         output_attentions=False,
-     ):
-         residual = hidden_states
-         hidden_states = self.ln_1(hidden_states)
-         attn_outputs = self.attn(
-             hidden_states,
-             attention_mask=attention_mask,
-             head_mask=head_mask,
-             output_attentions=output_attentions
-         )
-         attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
-         outputs = attn_outputs[1:]
-         hidden_states = attn_output + residual
-
-         residual = hidden_states
-         hidden_states = self.ln_2(hidden_states)
-         feed_forward_hidden_states = self.mlp(hidden_states)
-         hidden_states = residual + feed_forward_hidden_states
-
-         outputs = (hidden_states,) + outputs[1:]
-         return outputs  # hidden_states, present, (attentions)
-
-
- class CodeSagePreTrainedModel(PreTrainedModel):
-     config_class = CodeSageConfig
-     base_model_prefix = "transformer"
-
-     def _init_weights(self, module):
-         """Initialize the weights."""
-         if isinstance(module, (nn.Linear, Conv1D)):
-             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-             if module.bias is not None:
-                 module.bias.data.zero_()
-         elif isinstance(module, nn.Embedding):
-             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-             if module.padding_idx is not None:
-                 module.weight.data[module.padding_idx].zero_()
-         elif isinstance(module, nn.LayerNorm):
-             module.bias.data.zero_()
-             module.weight.data.fill_(1.0)
-
-
- class CodeSageModel(CodeSagePreTrainedModel):
-     def __init__(self, config):
-         super().__init__(config)
-
-         self.wte = nn.Embedding(config.vocab_size, config.hidden_size)
-         self.wpe = nn.Embedding(config.max_position_embeddings, config.hidden_size)
-
-         self.drop = nn.Dropout(config.embedding_dropout_prob)
-         self.h = nn.ModuleList([CodeSageBlock(config) for _ in range(config.num_hidden_layers)])
-         self.ln_f = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
-
-         self.init_weights()
-
-     def get_input_embeddings(self):
-         return self.wte
-
-     def set_input_embeddings(self, new_embeddings: torch.Tensor):
-         self.wte = new_embeddings
-
-     def forward(
-         self,
-         input_ids=None,
-         attention_mask=None,
-         position_ids=None,
-         head_mask=None,
-         inputs_embeds=None,
-         output_attentions=None,
-         output_hidden_states=None,
-         return_dict=None
-     ):
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = (
-             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         )
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         if input_ids is not None and inputs_embeds is not None:
-             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-         if input_ids is not None:
-             input_shape = input_ids.size()
-         elif inputs_embeds is not None:
-             input_shape = inputs_embeds.size()[:-1]
-         else:
-             raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-         device = input_ids.device if input_ids is not None else inputs_embeds.device
-         if position_ids is None:
-             position_ids = torch.arange(input_shape[-1], dtype=torch.long, device=device)
-             position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-         else:
-             position_ids = position_ids.view(-1, input_shape[-1])
-
-         extended_attention_mask = None
-         if attention_mask is not None:
-             assert attention_mask.dim() == 2
-             extended_attention_mask = attention_mask[:, None, None, :]
-             extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
-             extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-         if inputs_embeds is None:
-             inputs_embeds = self.wte(input_ids)
-
-         position_embeds = self.wpe(position_ids)
-         hidden_states = inputs_embeds + position_embeds
-
-         hidden_states = self.drop(hidden_states)
-         output_shape = input_shape + (hidden_states.size(-1),)
-
-         all_self_attentions = () if output_attentions else None
-         all_hidden_states = () if output_hidden_states else None
-         for i, block in enumerate(self.h):
-             if output_hidden_states:
-                 all_hidden_states = all_hidden_states + (hidden_states,)
-
-             outputs = block(
-                 hidden_states,
-                 attention_mask=extended_attention_mask,
-                 head_mask=head_mask[i],
-                 output_attentions=output_attentions,
-             )
-
-             hidden_states = outputs[0]
-             if output_attentions:
-                 all_self_attentions = all_self_attentions + (outputs[1],)
-
-         hidden_states = self.ln_f(hidden_states)
-         hidden_states = hidden_states.view(*output_shape)
-         if output_hidden_states:
-             all_hidden_states = all_hidden_states + (hidden_states,)
-
-         pooled_output = None  # max-pooled output
-         if attention_mask is not None:
-             pooled_output = (hidden_states * attention_mask[:, :, None]).sum(1) / attention_mask.sum(1)[:, None]
-
-         if not return_dict:
-             return tuple(
-                 v
-                 for v in [hidden_states, pooled_output, all_hidden_states, all_self_attentions]
-                 if v is not None
-             )
-
-         return BaseModelOutputWithPooling(
-             last_hidden_state=hidden_states,
-             pooler_output=pooled_output,
-             hidden_states=all_hidden_states,
-             attentions=all_self_attentions
-         )
-
-
- class CodeSageForMaskedLM(CodeSagePreTrainedModel):
-     _tied_weights_keys = ["lm_head.weight"]
-
-     def __init__(self, config):
-         super().__init__(config)
-         self.transformer = CodeSageModel(config)
-         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-         self.init_weights()
-
-     def get_output_embeddings(self):
-         return self.lm_head
-
-     def set_output_embeddings(self, new_embeddings):
-         self.lm_head = new_embeddings
-
-     def forward(
-         self,
-         input_ids=None,
-         attention_mask=None,
-         position_ids=None,
-         head_mask=None,
-         inputs_embeds=None,
-         labels=None,
-         output_attentions=None,
-         output_hidden_states=None,
-         return_dict=None
-     ):
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         transformer_outputs = self.transformer(
-             input_ids,
-             attention_mask=attention_mask,
-             position_ids=position_ids,
-             head_mask=head_mask,
-             inputs_embeds=inputs_embeds,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict
-         )
-         hidden_states = transformer_outputs[0]
-         lm_logits = self.lm_head(hidden_states)
-
-         masked_lm_loss = None
-         if labels is not None:
-             loss_fct = CrossEntropyLoss()
-             masked_lm_loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
-
-         if not return_dict:
-             output = (lm_logits,) + transformer_outputs[1:]
-             return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-
-         return MaskedLMOutput(
-             loss=masked_lm_loss,
-             logits=lm_logits,
-             hidden_states=transformer_outputs.hidden_states,
-             attentions=transformer_outputs.attentions,
-         )
-
-
- class CodeSageForSequenceClassification(CodeSagePreTrainedModel):
-
-     def __init__(self, config):
-         super().__init__(config)
-         self.num_labels = config.num_labels
-         self.config = config
-
-         self.transformer = CodeSageModel(config)
-         classifier_dropout = (
-             config.classifier_dropout
-             if hasattr(config, 'classifier_dropout') and config.classifier_dropout is not None
-             else config.residual_dropout_prob
-         )
-         self.dropout = nn.Dropout(classifier_dropout)
-         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-         # Initialize weights and apply final processing
-         self.post_init()
-
-     def forward(
-         self,
-         input_ids=None,
-         attention_mask=None,
-         position_ids=None,
-         head_mask=None,
-         inputs_embeds=None,
-         labels=None,
-         output_attentions=None,
-         output_hidden_states=None,
-         return_dict=None,
-     ):
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-         assert attention_mask is not None, "attention_mask is needed to perform max-pooling"
-
-         outputs = self.transformer(
-             input_ids,
-             attention_mask=attention_mask,
-             position_ids=position_ids,
-             head_mask=head_mask,
-             inputs_embeds=inputs_embeds,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-
-         pooled_output = outputs[1]
-         pooled_output = self.dropout(pooled_output)
-         logits = self.classifier(pooled_output)
-
-         loss = None
-         if labels is not None:
-             if self.config.problem_type is None:
-                 if self.num_labels == 1:
-                     self.config.problem_type = "regression"
-                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                     self.config.problem_type = "single_label_classification"
-                 else:
-                     self.config.problem_type = "multi_label_classification"
-
-             if self.config.problem_type == "regression":
-                 loss_fct = MSELoss()
-                 if self.num_labels == 1:
-                     loss = loss_fct(logits.squeeze(), labels.squeeze())
-                 else:
-                     loss = loss_fct(logits, labels)
-             elif self.config.problem_type == "single_label_classification":
-                 loss_fct = CrossEntropyLoss()
-                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-             elif self.config.problem_type == "multi_label_classification":
-                 loss_fct = BCEWithLogitsLoss()
-                 loss = loss_fct(logits, labels)
-
-         if not return_dict:
-             output = (logits,) + outputs[2:]
-             return ((loss,) + output) if loss is not None else output
-
-         return SequenceClassifierOutput(
-             loss=loss,
-             logits=logits,
-             hidden_states=outputs.hidden_states,
-             attentions=outputs.attentions,
-         )
.ipynb_checkpoints/tokenizer_config-checkpoint.json DELETED
@@ -1,34 +0,0 @@
- {
-   "add_prefix_space": false,
-   "additional_special_tokens": [
-     "<|endoftext|>",
-     "<fim_prefix>",
-     "<fim_middle>",
-     "<fim_suffix>",
-     "<fim_pad>",
-     "<filename>",
-     "<gh_stars>",
-     "<issue_start>",
-     "<issue_comment>",
-     "<issue_closed>",
-     "<jupyter_start>",
-     "<jupyter_text>",
-     "<jupyter_code>",
-     "<jupyter_output>",
-     "<empty_output>",
-     "<commit_before>",
-     "<commit_msg>",
-     "<commit_after>",
-     "<reponame>"
-   ],
-   "bos_token": "<|endoftext|>",
-   "eos_token": "<|endoftext|>",
-   "add_eos_token": true,
-   "model_max_length": 1000000000000000019884624838656,
-   "unk_token": "<|endoftext|>",
-   "vocab_size": 49152,
-   "tokenizer_class": "CodeSageTokenizer",
-   "auto_map": {
-     "AutoTokenizer": ["tokenization_codesage.CodeSageTokenizer", null]
-   }
- }
README.md CHANGED
@@ -1,3 +1,90 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ datasets:
+ - bigcode/the-stack-dedup
+ - bigcode/the-stack-v2
+
+ library_name: transformers
+ language:
+ - code
+ ---
+
+ ## CodeSage-Base-v2
+
+ ### Model description
+ CodeSage is a family of open code embedding models with an encoder architecture that supports a wide range of source code understanding tasks. It was initially introduced in the paper:
+
+ [Code Representation Learning At Scale by Dejiao Zhang*, Wasi Uddin Ahmad*, et al.](https://arxiv.org/abs/2402.01935)
+
+ For this V2 model, we improved semantic search performance by raising the quality of the contrastive learning data through [consistency filtering](https://arxiv.org/abs/2209.11755). Starting from the pretrained checkpoint of our V1 model (Zhang et al., 2023), trained with both Masked Language Modeling (MLM) and deobfuscation (see [Section 3.1](https://arxiv.org/abs/2402.01935)), we applied contrastive learning on the filtered data. Unlike the V1 model, we extracted the initial set of (text, code) pairs, i.e., summaries paired with function or class bodies, from [The Stack V2](https://huggingface.co/datasets/bigcode/the-stack-v2) rather than the [V1](https://huggingface.co/datasets/bigcode/the-stack-dedup) data. We first applied the simple rule-based filtering detailed in our previous work, then consistency filtering to further refine the pairs. While switching to The Stack V2 brought minor gains on downstream tasks, the majority of the improvement came from the consistency filtering.
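+
+ To make the filtering step concrete, the sketch below shows one way consistency filtering can be implemented: keep a (summary, code) pair only if the code is retrieved among the top-k candidates for its own summary within a random batch. This is an illustrative sketch, not our training pipeline; the `encode` callable, batch size, and `top_k` threshold are assumptions for the example.
+
+ ```python
+ import torch.nn.functional as F
+
+ def consistency_filter(pairs, encode, top_k=2, batch_size=1024):
+     """Keep (summary, code) pairs whose code ranks in the top-k candidates
+     retrieved for its own summary within a batch of candidate codes.
+
+     `encode` is any callable mapping a list of strings to a (B, D) tensor
+     of embeddings, e.g. a wrapper around a CodeSage encoder."""
+     kept = []
+     for start in range(0, len(pairs), batch_size):
+         batch = pairs[start:start + batch_size]
+         texts = [t for t, _ in batch]
+         codes = [c for _, c in batch]
+         text_emb = F.normalize(encode(texts), dim=-1)
+         code_emb = F.normalize(encode(codes), dim=-1)
+         sim = text_emb @ code_emb.T                   # (B, B) cosine similarities
+         ranks = sim.argsort(dim=-1, descending=True)  # candidate codes per summary
+         for i, pair in enumerate(batch):
+             if i in ranks[i, :top_k].tolist():        # own code within top-k -> keep
+                 kept.append(pair)
+     return kept
+ ```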
+
+ ### Model Performance
+
+ #### 1. Code2Code Search
+ | Model Name | # Params | Embd Dim | Python | Java | JS | TS | C# | C | Ruby | PHP | GO | AVG |
+ |---------------------|----------|----------|--------|-------|-------|--------|--------|--------|--------|--------|--------|--------|
+ | OpenAI-Code-01 | NA | 3072 | 21.92 | 8.90 | 4.90 | 5.70 | 3.15 | 11.58 | 26.25 | 16.60 | 9.40 | 12.04 |
+ | OpenAI-Ada-002 | NA | 1536 | 35.91 | 25.13 | 19.01 | 21.86 | 10.17 | 29.15 | 40.85 | 40.47 | 23.43 | 27.33 |
+ | OpenAI-Text-3-Small | NA | 1536 | 25.18 | 12.61 | 8.00 | 9.44 | 5.46 | 15.86 | 30.70 | 23.33 | 11.20 | 15.57 |
+ | OpenAI-Text-3-Large | NA | 3072 | 40.57 | 25.33 | 20.09 | 22.00 | 11.84 | 31.90 | 42.54 | 41.84 | 21.75 | 28.65 |
+ | CodeSage-Small | 130M | 1024 | 36.31 | 23.97 | 26.60 | 29.90 | 11.84 | 22.84 | 29.06 | 34.64 | 19.56 | 26.08 |
+ | CodeSage-Base | 356M | 1024 | 47.52 | 22.84 | 28.70 | 31.95 | 13.37 | 30.99 | 44.86 | 51.13 | 25.15 | 32.95 |
+ | CodeSage-Large | 1.3B | 2048 | 46.70 | 33.13 | 37.16 | 41.18 | 16.81 | 32.89 | 54.12 | 52.13 | 32.48 | 38.51 |
+ | CodeSage-v2-Small | 130M | 1024 | 45.60 | 33.65 | 39.96 | 47.78 | 19.19 | 30.55 | 40.12 | 55.39 | 30.96 | 38.13 |
+ | CodeSage-v2-Base | 356M | 1024 | 55.86 | 42.89 | 45.29 | 54.58 | 23.90 | 38.52 | 56.02 | 64.56 | 42.88 | 47.17 |
+ | CodeSage-v2-Large | 1.3B | 2048 | 61.11 | 47.09 | 51.18 | 60.67 | 28.04 | 43.40 | 60.74 | 67.87 | 43.86 | 51.55 |
+
+ #### 2. NL2Code Search
+ | Model Name | # Params | CoSQA | AdvTest | Python | Java | JS | PHP | GO | Ruby | Avg |
+ |---------------------|----------|-------|---------|--------|-------|-------|--------|--------|--------|--------|
+ | OpenAI-Code-01 | NA | 52.20 | 36.03 | 63.13 | 67.85 | 62.30 | 57.47 | 85.22 | 69.28 | 61.69 |
+ | OpenAI-Ada-002 | NA | 44.23 | 38.08 | 68.02 | 71.49 | 67.50 | 60.62 | 85.63 | 74.20 | 63.72 |
+ | OpenAI-Text-3-Small | NA | 52.48 | 34.10 | 62.62 | 65.87 | 60.28 | 54.85 | 81.96 | 67.57 | 59.97 |
+ | OpenAI-Text-3-Large | NA | 55.21 | 46.83 | 70.81 | 72.89 | 68.12 | 59.58 | 87.60 | 75.22 | 67.03 |
+ | CodeSage-Small | 130M | 49.93 | 41.05 | 64.26 | 63.19 | 59.87 | 54.65 | 77.60 | 63.18 | 59.22 |
+ | CodeSage-Base | 356M | 48.50 | 48.87 | 67.81 | 68.00 | 66.87 | 58.13 | 83.17 | 68.00 | 63.67 |
+ | CodeSage-Large | 1.3B | 47.49 | 52.35 | 70.64 | 70.20 | 69.54 | 61.31 | 83.60 | 71.88 | 65.88 |
+ | CodeSage-v2-Small | 130M | 52.39 | 47.28 | 68.79 | 68.13 | 65.77 | 60.20 | 80.26 | 72.46 | 64.41 |
+ | CodeSage-v2-Base | 356M | 50.74 | 52.00 | 70.46 | 70.89 | 69.61 | 62.81 | 82.37 | 73.71 | 66.57 |
+ | CodeSage-v2-Large | 1.3B | 53.18 | 56.31 | 74.18 | 72.33 | 72.49 | 65.26 | 84.67 | 76.61 | 69.38 |
+
+ ### Training Data
+ The pretrained checkpoint is the same as the one used by our V1 model ([codesage/codesage-small](https://huggingface.co/codesage/codesage-small)), which was trained on [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup) data. The contrastive learning data were extracted from [The Stack V2](https://huggingface.co/datasets/bigcode/the-stack-v2). As with our V1 model, we support the following nine languages: c, c-sharp, go, java, javascript, typescript, php, python, ruby.
+
+ ### How to use
+ This checkpoint consists of an encoder (356M parameters, per the tables above) that extracts 1024-dimensional code embeddings. It can be loaded with the AutoModel functionality and uses the [StarCoder tokenizer](https://arxiv.org/pdf/2305.06161.pdf).
+
+ ```python
+ from transformers import AutoModel, AutoTokenizer
+
+ checkpoint = "codesage/codesage-base-v2"
+ device = "cuda"  # for GPU usage or "cpu" for CPU usage
+
+ # Note: CodeSage requires adding eos token at the end of each tokenized sequence
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, add_eos_token=True)
+
+ model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)
+
+ inputs = tokenizer.encode("def print_hello_world():\tprint('Hello World!')", return_tensors="pt").to(device)
+ embedding = model(inputs)[0]
+ ```
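+
+ The snippet above returns token-level hidden states. For search-style use cases you typically want a single vector per input; a minimal sketch of that (reusing the `tokenizer`, `model`, and `device` defined above, assuming batched inputs with a configured pad token, and using the mask-aware mean pooling the model exposes as `pooler_output`) could look like this:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ query = "function that prints a greeting"
+ code_snippets = [
+     "def add(a, b):\n    return a + b",
+     "def greet(name):\n    print(f'Hello {name}!')",
+ ]
+
+ def embed(texts):
+     # Padding assumes the tokenizer has a pad token set; configure one if it does not.
+     batch = tokenizer(texts, padding=True, return_tensors="pt").to(device)
+     with torch.no_grad():
+         out = model(batch.input_ids, attention_mask=batch.attention_mask)
+     # pooler_output is the attention-mask mean pooling computed inside CodeSageModel.
+     return F.normalize(out.pooler_output, dim=-1)
+
+ scores = embed([query]) @ embed(code_snippets).T
+ print(scores)  # higher cosine similarity = closer match
+ ```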
+
+ ### BibTeX entry and citation info
+ ```bibtex
+ @inproceedings{zhang2024code,
+     title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
+     author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
+     booktitle={The Twelfth International Conference on Learning Representations},
+     year={2024},
+     url={https://openreview.net/forum?id=vfzRRjumpX}
+ }
+ ```