| | """CodeT5 Vulnerability Detection model |
| | Binary Classication Safe(0) vs Vulnerable(1)""" |
| |
|
| | import torch |
| | import torch.nn as nn |
| | from transformers import T5ForConditionalGeneration, RobertaTokenizer |
| |
|
class VulnerabilityCodeT5(nn.Module):
    """CodeT5 model for vulnerability detection.

    Wraps a pretrained CodeT5 checkpoint and adds a small MLP
    classification head on top of the encoder output.

    Only the T5 *encoder* is exercised in :meth:`forward`; the decoder
    weights are loaded but never used. NOTE(review): they still count
    toward model size and optimizer state — consider loading
    ``T5EncoderModel`` instead (kept as-is here so existing saved
    state dicts keep loading).

    Labels: Safe (0) vs Vulnerable (1).
    """

    def __init__(self, model_name="Salesforce/codet5-base", num_labels=2):
        """
        Args:
            model_name: HuggingFace hub id of the pretrained CodeT5 model.
            num_labels: number of output classes (default 2: safe/vulnerable).
        """
        super().__init__()

        # Full seq2seq model; forward() only touches `.encoder`.
        self.encoder_decoder = T5ForConditionalGeneration.from_pretrained(model_name)

        hidden_size = self.encoder_decoder.config.d_model

        # Classification head: dropout -> dense -> ReLU -> dropout -> logits.
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels),
        )

        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: tokenized code, shape [batch_size, seq_len].
            attention_mask: optional attention mask [batch_size, seq_len];
                None means all positions are attended (T5 encoder default).
            labels: optional ground-truth class ids, shape [batch_size].

        Returns:
            dict with:
                'loss': cross-entropy loss, or None when labels is None.
                'logits': [batch_size, num_labels].
                'hidden_states': encoder output [batch_size, seq_len, d_model].
        """
        encoder_outputs = self.encoder_decoder.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )

        hidden_state = encoder_outputs.last_hidden_state
        # Pool by taking the first token's representation.
        # NOTE(review): T5 has no [CLS] token, so position 0 is just the
        # first code token; mask-aware mean pooling may work better —
        # kept as-is to preserve trained-model behavior.
        pooled_output = hidden_state[:, 0, :]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return {
            'loss': loss,
            'logits': logits,
            'hidden_states': hidden_state,
        }

    def predict(self, input_ids, attention_mask=None):
        """Return (predicted class ids, class probabilities), no gradients.

        Fixes over the original implementation:
        - calls ``self(...)`` instead of ``self.forward(...)`` so that
          nn.Module hooks keep firing;
        - restores the previous train/eval mode on exit instead of
          leaving the model permanently in eval mode.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                outputs = self(input_ids, attention_mask=attention_mask)
                probs = torch.softmax(outputs["logits"], dim=-1)
                predictions = torch.argmax(probs, dim=-1)
        finally:
            # Put the module back the way we found it.
            if was_training:
                self.train()

        return predictions, probs
| | |
def count_parameters(model):
    """Return the number of trainable parameters in *model*.

    Only parameters with ``requires_grad=True`` are counted, so frozen
    layers do not contribute to the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total