Gong Baitao committed on
Commit f8f1eff
1 Parent(s): e3314e1

Update modeling_cpmbee.py

Files changed (1)
  1. modeling_cpmbee.py +244 -4
modeling_cpmbee.py CHANGED
@@ -451,7 +451,7 @@ class CpmBeeEncoder(nn.Module):
             hidden_states, attn_weights, current_key_value = layer_outputs
             if output_attentions:
                 all_self_attns += (attn_weights,)
-            if current_key_value is not None:
+            if current_key_values is not None:
                 current_key_values = current_key_values + (current_key_value,)
 
         hidden_states = self.output_layernorm(hidden_states)
@@ -734,6 +734,125 @@ class CpmBeeModel(CpmBeePreTrainedModel):
         config_class=_CONFIG_FOR_DOC,
     )
     def forward(
+        self,
+        input_ids: torch.Tensor,
+        input_id_sub: Optional[torch.Tensor] = None,
+        length: Optional[torch.Tensor] = None,
+        context: Optional[torch.Tensor] = None,
+        sample_ids: Optional[torch.Tensor] = None,
+        num_segments: Optional[torch.Tensor] = None,
+        segment: Optional[torch.Tensor] = None,
+        segment_rel_offset: Optional[torch.Tensor] = None,
+        segment_rel: Optional[torch.Tensor] = None,
+        span: Optional[Dict] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        past_key_values: Optional[List] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        # dummy setting for common tests
+        if input_id_sub is None:
+            dtype, device = input_ids.dtype, input_ids.device
+            batch, seq_length = input_ids.size()
+            segment = torch.where(input_ids != 0, 2, 0).to(dtype=dtype, device=device)
+            context = torch.full((batch, seq_length), 1, dtype=dtype, device=device)
+            position = torch.arange(seq_length, dtype=dtype, device=device).repeat(batch, 1)
+            input_id_sub = torch.full((batch, seq_length), 0, dtype=dtype, device=device)
+            segment_rel_offset = torch.full((batch, seq_length), 0, dtype=dtype, device=device)
+            segment_rel = torch.full((batch, seq_length), 0, dtype=dtype, device=device)
+            num_segments = torch.full((batch, seq_length), 0, dtype=dtype, device=device)
+            sample_ids = torch.zeros_like(input_ids)
+
+        with torch.no_grad():
+            batch = input_ids.size(0)
+            seqlen = input_ids.size(1)
+            device = input_ids.device
+
+            # calc segment bucket
+            segment_rel_2d = torch.masked_fill(
+                segment[:, :, None] * num_segments[:, :, None]
+                + segment[:, None, :]
+                + segment_rel_offset[:, :, None],
+                ~(
+                    (sample_ids[:, :, None] == sample_ids[:, None, :])
+                    & (span[:, None, :] == span[:, :, None])
+                ), # not in the same span or sample
+                0, # avoid torch.gather overflow
+            ).view(batch, seqlen * seqlen)
+
+            segment_bucket = torch.gather(
+                input=segment_rel,
+                dim=1,
+                index=segment_rel_2d.long(),
+            ).view(batch, seqlen, seqlen)
+
+            segment_bucket.masked_fill_(
+                ~(
+                    (sample_ids[:, :, None] == sample_ids[:, None, :])
+                    & (span[:, None, :] == span[:, :, None])
+                ), # not in the same span or sample
+                1, # bucket is used for in-context samples
+            )
+
+            # directional mask
+            directional_mask_2d = torch.arange(seqlen, device=device) <= torch.arange(
+                seqlen, device=device
+            ).view(-1, 1)
+            # sample mask
+            sample_mask_2d = (sample_ids[:, :, None] == 0) | (
+                sample_ids[:, :, None] == sample_ids[:, None, :]
+            )
+            # context mask
+            attention_mask = context[:, None, :] | (
+                context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen)
+            )
+            # span mask
+            attention_mask = (
+                attention_mask & sample_mask_2d & (span[:, None, :] == span[:, :, None])
+            )
+            # length mask
+            mask_1d = (
+                torch.arange(seqlen, device=device)[None, :].repeat(batch, 1) < length[:, None]
+            )
+            attention_mask = (
+                mask_1d.view(batch, seqlen, 1) & mask_1d.view(batch, 1, seqlen) & attention_mask
+            )
+            position = torch.arange(seqlen, device=device).expand(batch, seqlen)
+
+        hidden_states = self.input_embedding(input_ids, input_id_sub)
+        position_bias = self.position_bias(position, position, segment_bucket)
+        hidden_states, present_key_values, all_hidden_states, all_attentions = self.encoder(
+            hidden_states,
+            attention_mask,
+            position_bias,
+            output_attentions,
+            output_hidden_states,
+            past_key_values=None,
+            use_cache=False
+        )
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, present_key_values, all_hidden_states, all_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+    def inference(
         self,
         input_ids: torch.Tensor,
         input_id_sub: Optional[torch.Tensor] = None,
@@ -1140,6 +1259,127 @@ class CpmBeeForCausalLM(CpmBeePreTrainedModel):
         config_class=_CONFIG_FOR_DOC,
     )
     def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        input_id_sub: Optional[torch.Tensor] = None,
+        length: Optional[torch.Tensor] = None,
+        context: Optional[torch.Tensor] = None,
+        sample_ids: Optional[torch.Tensor] = None,
+        num_segments: Optional[torch.Tensor] = None,
+        segment: Optional[torch.Tensor] = None,
+        segment_rel_offset: Optional[torch.Tensor] = None,
+        segment_rel: Optional[torch.Tensor] = None,
+        span: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        past_key_values: Optional[List] = None,
+        use_cache: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+        ext_table_ids: Optional[torch.Tensor] = None, # (ext_table_size) int32
+        ext_table_sub: Optional[torch.Tensor] = None, # (ext_table_size) int32
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`CPMBeeTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            input_id_sub (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                Subscription of input sequence tokens in the vocabulary.
+
+                Subscription of normal text will be zero while the special tokens of each group will be the 0, 1, 2,
+                ... <ans_0>, <ans_1>, <ans_2> ... belongs to group <ans>. <mask_0>, <mask_1>, <mask_2> ... belongs to
+                group <mask>.
+            length (`torch.Tensor` of shape `(batch_size)`):
+                The length of sequences in batch.
+            context (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                Whether this token id is context or not. If is context, the value is 1. If not, the value is 0. If a
+                token id is context, it does not need to be predicted.
+            sample_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                Give a sample id to every token id. The token ids with same sample ids belongs to the same sample.
+            num_segments (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                Total number of segments in the current input.
+            segment (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                Give a segment id to every token id. The token ids with same segment ids belongs to the same sample.
+
+                Generally, a string key or value in input data will be a segment. For example, input {"input": "hello,
+                ", "<ans>": ""}, the segments includes: "input", "hello, ", "<ans>" and "".
+            segment_rel_offset (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                The offset of segment rel.
+            segment_rel (`torch.Tensor` of shape `(batch_size, seq_len)`):
+                The segment relevance. A relative implementation of measuring the importance of segments.
+            span (`Dict[str, Union[torch.Tensor, List]]`):
+                Span will record every input_ids shape.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers.
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                A dummy arguments for CPMBee. The `past_states` contains pre-computed hidden-states (key and values in
+                the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values`
+                input) and other history arguments to speed up sequential decoding.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            ext_table_ids (`torch.Tensor`, *optional*):
+                ext_table ids for embedding projection.
+            ext_table_sub (`torch.Tensor`, *optional*):
+                ext_table subscriptions for embedding projection.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        model_output = self.cpmbee(
+            input_ids,
+            input_id_sub,
+            length,
+            context,
+            sample_ids,
+            num_segments,
+            segment,
+            segment_rel_offset,
+            segment_rel,
+            span,
+            output_attentions,
+            output_hidden_states,
+            past_key_values,
+            use_cache,
+            return_dict,
+        )
+        hidden_states = model_output.last_hidden_state if return_dict else model_output[0]
+
+        if ext_table_ids is not None:
+            ext_table = self.cpmbee.input_embedding(ext_table_ids, ext_table_sub)
+        else:
+            ext_table = None
+        logits = self.cpmbee.input_embedding.projection(hidden_states, ext_table)
+
+        loss = None
+        if labels is not None:
+            loss_func = nn.CrossEntropyLoss()
+            loss = loss_func(logits.view(-1, logits.size(-1)), labels.long().view(-1))
+
+        if not return_dict:
+            output = (logits,) + model_output[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=model_output.past_key_values,
+            hidden_states=model_output.hidden_states,
+            attentions=model_output.attentions,
+        )
+
+    def inference(
         self,
         input_ids: Optional[torch.Tensor] = None,
         input_id_sub: Optional[torch.Tensor] = None,
@@ -1234,7 +1474,7 @@ class CpmBeeForCausalLM(CpmBeePreTrainedModel):
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        model_output = self.cpmbee(
+        model_output = self.cpmbee.inference(
             input_ids,
             input_id_sub,
             position,
@@ -1533,7 +1773,7 @@ class CpmBeeForCausalLM(CpmBeePreTrainedModel):
         # init inference
         model_inputs, input_ids = self.prepare_inputs_for_generation(input_ids, batch_size, **model_kwargs)
         pred_start_index = input_ids.size(-1)
-        outputs = self(
+        outputs = self.inference(
            **model_inputs,
            return_dict=True,
            output_attentions=output_attentions,
@@ -1578,7 +1818,7 @@ class CpmBeeForCausalLM(CpmBeePreTrainedModel):
            input_ids, batch_size, beam_scorer, **model_kwargs
        )

-        outputs = self(
+        outputs = self.inference(
            **model_inputs,
            return_dict=True,
            output_attentions=output_attentions,
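
A minimal sketch of the mask logic at the heart of the new CpmBeeModel.forward: context tokens are visible to every query position, while non-context (to-be-generated) positions fall back to causal attention. The toy sequence length and context pattern below are illustrative values, not taken from this diff.

import torch

seqlen = 5
# first three positions are prompt/context, last two are to be generated
context = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.bool)

# directional (causal) mask: query i may attend to keys j <= i
directional_mask_2d = torch.arange(seqlen) <= torch.arange(seqlen).view(-1, 1)

# same combination as the "context mask" step above: a key is visible if it is
# context, or if the query is non-context and the key is not ahead of it
attention_mask = context[:, None, :] | (
    context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen)
)
print(attention_mask[0].int())
# rows are queries, columns are keys: every row sees the three context keys;
# rows 3 and 4 additionally see only keys at or before their own position

In the model, this mask is further intersected with the sample, span, and length masks before being passed to the encoder.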