AIRI-Institute/gena-lm-bigbird-base-sparse-t2t

@@ -23,8 +23,6 @@ import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
-import numpy as np
 import torch
 import torch.utils.checkpoint
 from packaging import version
@@ -306,7 +304,11 @@ class BertSelfAttention(nn.Module):
             self.rotary_emb = RotaryEmbedding(self.rotary_dim, base=self.rotary_base)
         if self.is_sparse:
-            from deepspeed.ops.sparse_attention import SparseSelfAttention
             self.sparse_self_attention = SparseSelfAttention(self.sparse_config, max_seq_length=self.max_seq_len)
     def transpose_for_scores(self, x):
@@ -1871,126 +1873,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
-class APARENTLoss(nn.Module):
-    def __init__(self):
-        super(APARENTLoss, self).__init__()
-    def forward(self, p, y):
-        for i, n in enumerate(y):
-            if n == 0.:
-                y[i] += 1e-3
-            elif n == 1.:
-                y[i] -= 1e-3
-        loss = p * torch.log(p / y) + (1 - p) * torch.log((1 - p) / (1 - y))
-        return loss.mean()
-@add_start_docstrings(
-    """
-    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
-    output) e.g. for GLUE tasks.
-    """,
-    BERT_START_DOCSTRING,
-)
-class BertForAPARENTSequenceRegression(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.config = config
-        self.bert = BertModel(config)
-        classifier_dropout = (
-            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
-        )
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        # Initialize weights and apply final processing
-        self.post_init()
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-        pos_weight=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-    ):
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if np.all(input_ids[:, -1].detach().cpu().numpy() == np.array([3 for i in range(len(input_ids))])):
-            pass
-        else:
-            print("#########################################NOT ENOUGH TOKENS#######################################")
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        pooled_output = outputs[1]
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        logits = torch.sigmoid(logits)
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss() #APARENTLoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze().float(), labels.squeeze().float()) # if it is not a sparse model then --- labels.squeeze().float(), else --- labels.squeeze().half()
-                else:
-                    loss = loss_fct(logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss(pos_weight=pos_weight)
-                loss = loss_fct(logits, labels)
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-        return SequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
 @add_start_docstrings(
@@ -2174,7 +2056,7 @@ class BertForTokenClassification(BertPreTrainedModel):
                     loss_fct = BCEWithLogitsLoss(reduction='none', pos_weight=pos_weight)
                     loss = loss_fct(logits, labels)
                     loss = loss * labels_mask.unsqueeze(-1)
-                    loss = loss.sum() / labels_mask.sum() if labels_mask.sum() != 0.0 else 0.0
         if not return_dict:
             output = (logits,) + outputs[2:]

 from dataclasses import dataclass
 from typing import Optional, Tuple
 import torch
 import torch.utils.checkpoint
 from packaging import version
             self.rotary_emb = RotaryEmbedding(self.rotary_dim, base=self.rotary_base)
         if self.is_sparse:
+            try:
+                from deepspeed.ops.sparse_attention import SparseSelfAttention
+            except ImportError as e:
+                logger.error(f'DeepSpeed is required for Sparse Ops: {e}')
+                raise
             self.sparse_self_attention = SparseSelfAttention(self.sparse_config, max_seq_length=self.max_seq_len)
     def transpose_for_scores(self, x):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
 @add_start_docstrings(
                     loss_fct = BCEWithLogitsLoss(reduction='none', pos_weight=pos_weight)
                     loss = loss_fct(logits, labels)
                     loss = loss * labels_mask.unsqueeze(-1)
+                    loss = loss.sum() / labels_mask.sum() if labels_mask.sum() != 0.0 else torch.tensor(0.0, device=logits.device)
         if not return_dict:
             output = (logits,) + outputs[2:]