maomaocun committed on
Commit 01fd92c · verified · 1 Parent(s): 8bd8789

Update modeling_llada.py

update generate_function with streaming output

Files changed (1)
  1. modeling_llada.py +81 -31
modeling_llada.py CHANGED
@@ -1181,7 +1181,8 @@ class LLaDAModel(nn.Module):
         attention_bias: Optional[torch.Tensor] = None,
         past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
         use_cache: bool = False,
-        last_logits_only: bool = False,
+        last_block_logits_only: bool = False,
+        block_length: int = 64,
         output_hidden_states: Optional[bool] = None,
     ) -> LLaDAOutput:
         """
@@ -1351,10 +1352,9 @@ class LLaDAModel(nn.Module):
                     assert cache is not None
                     attn_key_values.extend(cache)

-        if last_logits_only:
-            # shape: (batch_size, 1, d_model)
-            x = x[:, -1, :].unsqueeze(1)
-
+        if last_block_logits_only:
+            # shape: (batch_size, block_length, d_model)
+            x = x[:, -block_length:, :]
         # Apply final layer norm.
         # shape: (batch_size, seq_len or 1, d_model)
         x = self.transformer.ln_f(x) # type: ignore
@@ -1406,6 +1406,7 @@ class LLaDAModelLM(PreTrainedModel):
             self.model = LLaDAModel(model_config, init_params=init_params)
         else:
             self.model = model
+        self.mask_id = model_config.mask_token_id

     def forward(
         self,
@@ -1419,7 +1420,8 @@ class LLaDAModelLM(PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        cache_position: Optional[Cache] = None, # This is a hack mitigation of an issue in transformers `4.39.x`
+        last_block_logits_only: bool = False,
+        block_length: int = 64,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         if use_cache is None:
             use_cache = self.config.use_cache
@@ -1438,6 +1440,8 @@ class LLaDAModelLM(PreTrainedModel):
             past_key_values=past_key_values,
             use_cache=use_cache,
             output_hidden_states=output_hidden_states,
+            last_block_logits_only=last_block_logits_only,
+            block_length=block_length,
         )

         logits = outputs.logits
@@ -1457,31 +1461,6 @@ class LLaDAModelLM(PreTrainedModel):
             hidden_states=hidden_states,
         )

-    def can_generate(self) -> bool:
-        return True
-
-    def prepare_inputs_for_generation(
-        self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
-    ):
-        if past_key_values:
-            # This is because we want the model to only process the last generated token.
-            input_ids = input_ids[:, -1:]
-        model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
-
-        model_inputs.update(kwargs)
-        model_inputs["use_cache"] = kwargs.pop("use_cache", self.config.use_cache)
-        return model_inputs
-
-    # TODO: these are required to make the implementation complete.
-    # def resize_position_embeddings(self, new_num_position_embeddings: int):
-    #     pass
-    #
-    # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
-    #     pass
-    #
-    # def _reorder_cache(self, past_key_values, beam_idx):
-    #     pass
-
     def get_input_embeddings(self) -> torch.nn.Module:
         return self.model.transformer.wte

@@ -1504,5 +1483,76 @@ class LLaDAModelLM(PreTrainedModel):
         if self.config.weight_tying:
             self.model.transformer.ff_out = self.model.transformer.wte

+
+    def prefill_phase(self, input_ids, block_length):
+        """Prefill phase: Process initial prompt and generate KV cache."""
+        with torch.no_grad():
+            outputs = self(
+                input_ids=input_ids,
+                use_cache=True,
+                return_dict=True,
+                last_block_logits_only=True,
+                block_length=block_length
+            )
+        output_past_key_values = []
+        for i in range(len(outputs.past_key_values)):
+            k,v = outputs.past_key_values[i]
+            new_k,new_v = k[:,:,:-block_length,:],v[:,:,:-block_length,:]
+            output_past_key_values.append((new_k,new_v))
+        output_past_key_values = tuple(output_past_key_values)
+        return {
+            'input_ids': input_ids,
+            'logits': outputs.logits,
+            'past_key_values': output_past_key_values,
+        }
+
+    def unmask_function_greedy(self, logits, x, threshold=0.9):
+        """Greedy unmasking function with confidence threshold."""
+        mask_index = x == self.mask_id
+        x_top_0 = torch.argmax(logits, dim=-1)
+        p = F.softmax(logits, dim=-1)
+        confidence = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x_top_0, -1)), -1)
+        transfer_index = torch.zeros_like(x_top_0, dtype=torch.bool, device=x_top_0.device)
+        confidence = torch.where(mask_index, confidence, -torch.inf)
+        for j in range(confidence.shape[0]):
+            mask = confidence[j] > threshold
+            if mask.sum() == 0:
+                max_conf_idx = torch.argmax(confidence[j])
+                mask[max_conf_idx] = True
+            transfer_index[j] = mask
+        x[transfer_index] = x_top_0[transfer_index]
+        return x
+
+    @torch.no_grad()
+    def generate(self, input_ids, attention_mask, max_gen_length=1024, block_length=64, threshold=0.9, streaming=False, eos_token_id=126081):
+        batchsize, prompt_length = input_ids.shape
+        max_num_blocks = max_gen_length // block_length
+        output_ids = input_ids
+        block_x = torch.full((batchsize, block_length), self.mask_id, dtype=torch.long).to(self.device)
+        output_ids = torch.cat([output_ids, block_x], dim=-1)
+        # prefilling block loop
+        prefill_outputs = self.prefill_phase(output_ids, block_length)
+        past_key_values = prefill_outputs['past_key_values']
+        logits = prefill_outputs['logits']
+        output_ids[:,-block_length:] = self.unmask_function_greedy(logits=logits, x=output_ids[:,-block_length:], threshold=threshold)
+        # decoding block loop
+        for j in range(max_num_blocks):
+            while (output_ids[:,-block_length:] == self.mask_id).sum():
+                outputs = self(
+                    input_ids=output_ids[:,-block_length:],
+                    past_key_values=past_key_values,
+                    use_cache=True,
+                    return_dict=True
+                )
+                output_ids[:,-block_length:] = self.unmask_function_greedy(logits=outputs.logits, x=output_ids[:,-block_length:], threshold=threshold)
+            past_key_values = outputs.past_key_values
+            if streaming:
+                yield output_ids[:,-block_length:]
+            if (output_ids == eos_token_id).any():
+                return output_ids[:, prompt_length:]
+            block_x = torch.full((batchsize, block_length), self.mask_id, dtype=torch.long).to(self.device)
+            output_ids = torch.cat([output_ids, block_x], dim=-1)
+        return output_ids[:, prompt_length:]
+
 # Register the model so that it is available for transformer pipelines, auto-loading, etc.
 AutoModel.register(LLaDAConfig, LLaDAModelLM)
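
For context, below is a minimal usage sketch of the new streaming interface; it is not part of the commit. It assumes the checkpoint is loaded with trust_remote_code=True so that this modeling file (and the AutoModel registration above) is used; the checkpoint id, prompt, device, and decoding parameters are illustrative placeholders.

# Usage sketch (not part of this commit): checkpoint id, prompt, and parameters are
# illustrative; only generate()'s signature and streaming behaviour come from the diff above.
import torch
from transformers import AutoModel, AutoTokenizer

checkpoint = "GSAI-ML/LLaDA-8B-Instruct"  # placeholder checkpoint id
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(
    checkpoint, trust_remote_code=True, torch_dtype=torch.bfloat16
).to("cuda").eval()

inputs = tokenizer("Explain masked diffusion decoding in one paragraph.", return_tensors="pt").to("cuda")

# With streaming=True, generate() is consumed as a Python generator: each yielded tensor
# holds the token ids of the newest generation block, so text can be shown incrementally.
for block_ids in model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_gen_length=256,
    block_length=64,
    threshold=0.9,
    streaming=True,
):
    print(tokenizer.batch_decode(block_ids, skip_special_tokens=True)[0], flush=True)

One Python detail worth noting: because generate now contains yield, calling it always returns a generator object; with streaming=False the final ids arrive as the generator's StopIteration value, so iterating with streaming=True as above is the most direct way to consume the output.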