Add support for greedy decoding
The current sampling implementation always draws tokens with `torch.multinomial`: when `temperature` is 0.0 the scaling step is skipped (the `temperature > 0` guard), but the code still samples stochastically from the unscaled distribution instead of taking the argmax. This PR adds a dedicated greedy path that triggers when `temperature` is 0.0, `top_k` is unset or 0, and `top_p` is unset or 1.0.
- modeling_llada2_moe.py +8 -0
modeling_llada2_moe.py (CHANGED):

```diff
@@ -1240,6 +1240,14 @@ class LLaDA2MoeModelLM(LLaDA2MoePreTrainedModel, GenerationMixin):
         orig_shape = logits.shape[:-1]
         vocab_size = logits.shape[-1]
         logits = logits.reshape(-1, vocab_size)
+
+        # Greedy mode: temperature = 0, no top-k/p
+        if temperature == 0.0 and (top_k in (None, 0)) and (top_p is None or top_p >= 1.0):
+            probs = F.softmax(logits, dim=-1)
+            token = logits.argmax(dim=-1, keepdim=True)
+            token_prob = probs.gather(-1, token)
+            return token.view(*orig_shape), token_prob.view(*orig_shape)
+
         if temperature > 0 and temperature != 1.0:
             logits = logits / temperature
         logits = self._top_k_logits(logits, top_k)
```
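For context, here is a minimal self-contained sketch of the sampler with the new greedy branch in place. The function name `sample_tokens` and the simplified stochastic tail are assumptions reconstructed from the diff's context lines; in the actual model this logic lives in a method of `LLaDA2MoeModelLM`, where top-k filtering goes through `self._top_k_logits` (and top-p filtering is likewise elided here).

```python
import torch
import torch.nn.functional as F


def sample_tokens(logits, temperature=1.0, top_k=None, top_p=None):
    """Sketch of the sampler with the greedy branch added by this PR.

    Hypothetical standalone version; the real method is part of
    LLaDA2MoeModelLM and applies top-k/top-p filtering via helpers.
    """
    orig_shape = logits.shape[:-1]
    vocab_size = logits.shape[-1]
    logits = logits.reshape(-1, vocab_size)

    # Greedy mode: temperature = 0, no top-k/p. Deterministic argmax,
    # returning both the token and its softmax probability.
    if temperature == 0.0 and (top_k in (None, 0)) and (top_p is None or top_p >= 1.0):
        probs = F.softmax(logits, dim=-1)
        token = logits.argmax(dim=-1, keepdim=True)
        token_prob = probs.gather(-1, token)
        return token.view(*orig_shape), token_prob.view(*orig_shape)

    # Stochastic path (simplified: real code also applies top-k/top-p filtering).
    if temperature > 0 and temperature != 1.0:
        logits = logits / temperature
    probs = F.softmax(logits, dim=-1)
    token = torch.multinomial(probs, num_samples=1)
    token_prob = probs.gather(-1, token)
    return token.view(*orig_shape), token_prob.view(*orig_shape)


# Usage: temperature=0.0 with top_k=0 and top_p=1.0 now yields
# deterministic greedy decoding instead of an unscaled multinomial draw.
logits = torch.randn(2, 16, 32000)  # (batch, seq_len, vocab)
tokens, probs = sample_tokens(logits, temperature=0.0, top_k=0, top_p=1.0)
assert tokens.shape == (2, 16) and probs.shape == (2, 16)
```

Returning the per-token probability alongside the argmax token keeps the greedy branch's return signature identical to the multinomial path, so callers need no changes.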