Update generation_utils.py
generation_utils.py  CHANGED  (+158 -215)
@@ -1,5 +1,3 @@
-# coding=utf-8
-# Copyright 2024 The Dream team, HKUNLP Group and...
 import warnings
 import copy
 from dataclasses import dataclass
@@ -15,66 +13,26 @@ from transformers.utils import ModelOutput, is_torchdynamo_compiling, logging
 logger = logging.get_logger(__name__)


-def top_p_logits(logits, top_p=None):
-    if top_p is None or top_p >= 1:
-        return logits
-    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-    sorted_indices_to_remove = cumulative_probs > top_p
-    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-    sorted_indices_to_remove[..., 0] = 0
-    mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
-    mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
-    logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
-    return logits
-
-
-def top_k_logits(logits, top_k=None):
-    if top_k is None:
-        return logits
-    top_k = int(min(top_k, logits.size(-1)))
-    if top_k <= 0:
-        return logits
-    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
-    logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
-    return logits
-
-
-def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
-    # logits: [N, V]
     if temperature and temperature > 0:
         logits = logits / temperature
     if top_p is not None and top_p < 1:
-        ...
     if top_k is not None:
-        ...
-        try:
-            x0 = dists.Categorical(probs=probs).sample()
-            confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
-        except Exception:
-            confidence, x0 = probs.max(dim=-1)
-    else:
-        confidence, x0 = probs.max(dim=-1)
-
-    # switch between confidence definitions
-    if margin_confidence:
-        sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
-        top1_probs = sorted_probs[:, 0]
-        top2_probs = sorted_probs[:, 1]
-        confidence = top1_probs - top2_probs
-
-    if neg_entropy:
-        # negative entropy (<= 0; the closer to 0, the more "certain")
-        epsilon = 1e-10
-        log_probs = torch.log(probs + epsilon)
-        confidence = torch.sum(probs * log_probs, dim=-1)
-
-    return confidence, x0


 @dataclass
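The removed sample_tokens exposed two alternative confidence scores, the top1-top2 margin and negative entropy. A small illustrative comparison with made-up probabilities (not taken from this diff) shows how both rank a peaked distribution above a flat one:

# Toy comparison of the two confidence scores from the removed sample_tokens.
import torch

probs = torch.tensor([
    [0.90, 0.05, 0.05],   # peaked: model is confident
    [0.40, 0.35, 0.25],   # flat: model is unsure
])

sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
margin = sorted_probs[:, 0] - sorted_probs[:, 1]                   # larger = more confident
neg_entropy = torch.sum(probs * torch.log(probs + 1e-10), dim=-1)  # closer to 0 = more confident

print(margin)       # tensor([0.8500, 0.0500])
print(neg_entropy)  # roughly tensor([-0.3944, -1.0805])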
@@ -97,11 +55,12 @@ class DreamGenerationConfig(GenerationConfig):
         # diffusion specific params
         self.eps: float = kwargs.pop("eps", 1e-3)
         self.steps: int = kwargs.pop("steps", 512)
-        self.alg: str = kwargs.pop("alg", 'origin')  # ...
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)

         # RCR
         self.rcr: bool = kwargs.pop("rcr", False)
         self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus')

         # outputs
@@ -143,7 +102,7 @@ class DreamGenerationMixin:
         expand_size: int = 1,
         input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.LongTensor] = None
-    )
         if expand_size == 1:
             return input_ids, attention_mask
         if input_ids is not None:
@@ -152,91 +111,61 @@ class DreamGenerationMixin:
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask

-    #
-
-    # =========================
-    def _apply_rcr_logic(
         self,
-        x: torch.Tensor,
-        ...
-        fixed_conf: torch.Tensor,       # [B, L] float32 (historical max)
-        ema_conf: torch.Tensor,         # [B, L] float32 (EMA)
-        gen_mask: torch.Tensor,         # [B, L] bool (confirmed set)
-        written_step: torch.Tensor,     # [B, L] int32 (step at which written, -1 = not written)
-        init_mask_count: torch.Tensor,  # [B] initial number of masks
         mask_token_id: int,
         step: int,
         total_steps: int,
         s: torch.Tensor,
         t: torch.Tensor,
-        ema_beta: float = 0.95  # EMA smoothing coefficient (larger = more stable)
     ):
         """
-        ...
         """
-        device = x.device
         B, L = x.shape
-
-        # 1) quota (same as vanilla)
-        avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
-        ratio = (1.0 - (s.item() / t.item())) if step < total_steps - 1 else 1.0
-        number_transfer_tokens = int(avg_mask_now * ratio)
-
-        # scatter this step's local confidence / candidates to full length
-        full_conf_now = torch.full((B, L), -1e9, dtype=torch.float32, device=device)  # -1e9 is safer here
-        full_x0 = torch.full((B, L), mask_token_id, dtype=torch.long, device=device)
-        full_conf_now[mask_index] = conf_now
-        full_x0[mask_index] = x0
-
-        # 2) per-sample top-k selection for this step
        for j in range(B):
-            ...
-            _, low_local = torch.topk(cand_ema, k=over, largest=False)
-            low_global = candidates[low_local]
-
-            # re-mask
-            x[j, low_global] = mask_token_id
-            gen_mask[j, low_global] = False
-            # partially reset the EMA; keeping the max preserves an anchor for later stability
-            ema_conf[j, low_global] = 0.0
-            written_step[j, low_global] = -1  # reset the written step
-            # do not clear fixed_conf; keep the historical peak as "anchor" information

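The deleted _apply_rcr_logic tracked both a running-max buffer (fixed_conf) and an EMA buffer (ema_conf) of confidence. The exact update is not recoverable from this diff; a plausible sketch of that bookkeeping, using only the names from the deleted signature, could look like:

# Hypothetical sketch only: the original update is not shown in the diff.
import torch

def update_confidence_buffers(ema_conf, fixed_conf, conf_now, positions, ema_beta=0.95):
    # ema_conf, fixed_conf: [B, L] float32 buffers; positions: [B, L] bool mask of
    # the positions decoded this step; conf_now: confidences for those positions.
    ema_conf[positions] = ema_beta * ema_conf[positions] + (1.0 - ema_beta) * conf_now
    fixed_conf[positions] = torch.maximum(fixed_conf[positions], conf_now)
    return ema_conf, fixed_conf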
     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
         if is_torchdynamo_compiling():
@@ -249,29 +178,24 @@ class DreamGenerationMixin:
                 UserWarning,
             )
         if input_ids_length >= generation_config.max_length:
-            input_ids_string = "input_ids"
             raise ValueError(
-                f"Input length ...
-                ...
-                " setting `max_new_tokens`."
             )

     def _prepare_generated_length(self, generation_config, has_default_max_length, input_ids_length):
         if generation_config.max_new_tokens is not None:
             if not has_default_max_length and generation_config.max_length is not None:
                 logger.warning(
-                    f"Both `max_new_tokens` ...
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information."
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
-
         elif has_default_max_length:
             if generation_config.max_length == DreamGenerationConfig().max_length:
                 generation_config.max_length = generation_config.max_length + input_ids_length
-
-        if ...
-            generation_config.max_length = min(generation_config.max_length, ...
         return generation_config

     def _prepare_generation_config(self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict) -> DreamGenerationConfig:
@@ -295,7 +219,7 @@

         return generation_config

-    def _prepare_special_tokens(self, generation_config: DreamGenerationConfig, device ...
         def _tensor_or_none(token, device=None):
             if token is None:
                 return token
@@ -311,7 +235,6 @@

         if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
             eos_token_tensor = eos_token_tensor.unsqueeze(0)
-
         if pad_token_tensor is None and eos_token_tensor is not None:
             pad_token_tensor = eos_token_tensor[0]
             logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")
@@ -327,7 +250,7 @@
         inputs: Optional[torch.Tensor] = None,
         generation_config: Optional[DreamGenerationConfig] = None,
         **kwargs,
-    )
         generation_config = self._prepare_generation_config(generation_config, **kwargs)
         generation_tokens_hook_func = kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)
         generation_logits_hook_func = kwargs.pop("generation_logits_hook_func", lambda step, x, logits: logits)
@@ -350,9 +273,7 @@

         if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
             warnings.warn(
-                "You are calling .generate() with ...
-                f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
-                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation.",
                 UserWarning,
             )
         if (
@@ -361,8 +282,7 @@
             and attention_mask is None
         ):
             warnings.warn(
-                "Padding ...
-                "generation results, please set `attention_mask` when batch-padding inputs.",
                 UserWarning,
             )

@@ -372,14 +292,13 @@
             attention_mask=attention_mask,
         )

-        ...
             input_ids,
             attention_mask=attention_mask,
             generation_config=generation_config,
             generation_tokens_hook_func=generation_tokens_hook_func,
             generation_logits_hook_func=generation_logits_hook_func,
         )
-        return result

     def _sample(
         self,
@@ -388,7 +307,7 @@
         generation_config: DreamGenerationConfig,
         generation_tokens_hook_func,
         generation_logits_hook_func
-    )
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length
@@ -401,10 +320,7 @@
         top_p = generation_config.top_p
         top_k = generation_config.top_k

-        # RCR
-        rcr = generation_config.rcr
-        conf_alg = generation_config.conf_alg
-
         histories = [] if (return_dict_in_generate and output_history) else None

         # pad input_ids to max_length
@@ -424,75 +340,60 @@

         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)

-        # ===== RCR buffer initialization (key: float32, to avoid dtype conflicts) =====
         if rcr:
-            ...
-            gen_mask = None
-            written_step = None

         x = generation_tokens_hook_func(None, x, None)

         for i in range(steps):
             mask_index = (x == mask_token_id)
             logits = self(x, attention_mask, tok_idx).logits
             logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
             logits = generation_logits_hook_func(i, x, logits)

-            mask_logits = logits[mask_index]
             t = timesteps[i]
             s = timesteps[i + 1]

-            if ...
-                confidence, x0 = sample_tokens(
-                    mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, neg_entropy=True
-                )
             else:
-                ...
-                    written_step=written_step,
-                    init_mask_count=init_mask_count,
-                    mask_token_id=mask_token_id,
-                    step=i,
-                    total_steps=steps,
-                    s=s, t=t,
-                    ema_beta=0.8,
-                )
-            else:
-                # -- vanilla: this step's top-k is permanently confirmed --
                 avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
                 ratio = (1.0 - (s.item() / t.item())) if i < steps - 1 else 1.0
                 number_transfer_tokens = int(avg_mask_now * ratio)
@@ -512,6 +413,48 @@
             row_indices = torch.arange(x.size(0), device=self.device).unsqueeze(1).expand_as(transfer_index)
             x[row_indices, transfer_index] = x_[row_indices, transfer_index]

             x = generation_tokens_hook_func(i, x, logits)
             if histories is not None:
                 histories.append(x.clone())
@@ -1,5 +1,3 @@
 import warnings
 import copy
 from dataclasses import dataclass
@@ -15,66 +13,26 @@ from transformers.utils import ModelOutput, is_torchdynamo_compiling, logging
 logger = logging.get_logger(__name__)


+def _apply_top_p_k_temp(logits, temperature=0.0, top_p=None, top_k=None):
     if temperature and temperature > 0:
         logits = logits / temperature
     if top_p is not None and top_p < 1:
+        # top-p
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+        sorted_indices_to_remove = cumulative_probs > top_p
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+        mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
+        mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
+        logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
     if top_k is not None:
+        # top-k
+        top_k = int(min(top_k, logits.size(-1)))
+        if top_k > 0:
+            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+            logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
+    return logits
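A brief usage sketch of the merged helper (shapes and hyperparameters here are illustrative, and the snippet assumes it runs in this module, where _apply_top_p_k_temp and dists are defined):

# Illustrative only: filter logits, then sample, mirroring how the new code paths use the helper.
import torch
import torch.distributions as dists

logits = torch.randn(4, 32000)                                  # [N, V] logits for N masked positions
logits = _apply_top_p_k_temp(logits, temperature=0.7, top_p=0.9, top_k=50)
probs = torch.softmax(logits, dim=-1)
x0 = dists.Categorical(probs=probs).sample()                    # [N] sampled token ids
p_sel = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)   # [N] prob of each selected token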


 @dataclass

@@ -97,11 +55,12 @@ class DreamGenerationConfig(GenerationConfig):
         # diffusion specific params
         self.eps: float = kwargs.pop("eps", 1e-3)
         self.steps: int = kwargs.pop("steps", 512)
+        self.alg: str = kwargs.pop("alg", 'origin')  # used by the vanilla path
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)

         # RCR
         self.rcr: bool = kwargs.pop("rcr", False)
+        # Note: the paper-version RCR ignores conf_alg here and always uses the
+        # selected-token probability for its running max
         self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus')

         # outputs
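For reference, a hedged sketch of how these switches could be passed; it assumes DreamGenerationConfig forwards keyword arguments the same way GenerationConfig does, and the field names follow the kwargs.pop calls above:

# Assumed construction of a config with the RCR switch enabled; values are examples only.
config = DreamGenerationConfig(
    steps=256,
    eps=1e-3,
    temperature=0.7,
    top_p=0.9,
    alg="origin",             # vanilla decoding path
    alg_temp=None,
    rcr=True,                 # enable the paper-version RCR remasking
    conf_alg="maskgit_plus",  # ignored by the paper-version RCR (see the note above)
)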
@@ -143,7 +102,7 @@ class DreamGenerationMixin:
         expand_size: int = 1,
         input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.LongTensor] = None
+    ):
         if expand_size == 1:
             return input_ids, attention_mask
         if input_ids is not None:

@@ -152,91 +111,61 @@ class DreamGenerationMixin:
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask

+    # =============== Paper-version RCR: running max confidence + directly pick n_t to re-mask ===============
+    def _apply_rcr_logic_paper(
         self,
+        x: torch.Tensor,                # [B, L]
+        rmax_conf: torch.Tensor,        # [B, L], float32, running max of selected-token prob
+        init_mask_bool: torch.Tensor,   # [B, L], initial generation region (positions that started as MASK)
+        init_mask_count: torch.Tensor,  # [B], initial number of MASK tokens (M0)
         mask_token_id: int,
         step: int,
         total_steps: int,
         s: torch.Tensor,
         t: torch.Tensor,
     ):
         """
+        Goal: within the initial generation region (init_mask_bool), keep the number of
+        confirmed tokens on the same linear schedule as vanilla decoding, but choose the
+        positions by the historical maximum confidence rmax_conf: each step keeps the
+        high-rmax_conf positions and re-masks the low ones.
+
+        Procedure:
+            target_cum = floor(M0 * (1 - s/t))   # equals M0 at the last step
+            within init_mask_bool[j], keep the target_cum positions with the highest
+            rmax_conf[j] as confirmed (unmasked)
+            set every other position back to mask_token_id
         """
         B, L = x.shape
         for j in range(B):
+            M0 = int(init_mask_count[j].item())
+            if step < total_steps - 1:
+                target_cum = int(M0 * (1.0 - (s.item() / t.item())))
+            else:
+                target_cum = M0
+
+            # rank positions inside the initial generation region
+            region_idx = torch.where(init_mask_bool[j])[0]
+            if region_idx.numel() == 0:
+                continue
+
+            # larger rmax_conf = more stable; keep the top target_cum positions
+            scores = rmax_conf[j, region_idx]  # float32
+            # defensive: positions never updated keep their initial rmax_conf of 0.0 and are
+            # re-masked first (matching the intuition "never been confident so far")
+            target_cum = min(target_cum, int(region_idx.numel()))
+            if target_cum <= 0:
+                # keep everything masked
+                x[j, region_idx] = mask_token_id
+                continue
+
+            _, keep_local = torch.topk(scores, k=target_cum, largest=True)
+            keep_global = region_idx[keep_local]
+
+            # re-mask the rest
+            mask_global = torch.ones_like(region_idx, dtype=torch.bool, device=x.device)
+            mask_global[keep_local] = False
+            remask_idx = region_idx[mask_global]
+
+            if remask_idx.numel() > 0:
+                x[j, remask_idx] = mask_token_id
+            # positions in keep_global keep their currently written tokens untouched
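To make the keep-count schedule concrete, here is a small self-contained check; the values of eps, steps, and M0 are made up for illustration:

# Prints the cumulative keep count target_cum per step for a toy setting.
import torch

eps, steps, M0 = 1e-3, 4, 100
timesteps = torch.linspace(1, eps, steps + 1)

for i in range(steps):
    t, s = timesteps[i], timesteps[i + 1]
    target_cum = int(M0 * (1.0 - (s.item() / t.item()))) if i < steps - 1 else M0
    print(i, target_cum)
# Output: 0 24, 1 33, 2 49, 3 100 -- the confirmed count ramps up and reaches
# M0 exactly at the final step, matching the docstring above.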

     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
         if is_torchdynamo_compiling():

@@ -249,29 +178,24 @@
                 UserWarning,
             )
         if input_ids_length >= generation_config.max_length:
             raise ValueError(
+                f"Input length is {input_ids_length}, but `max_length` is {generation_config.max_length}. "
+                "Increase `max_length` or set `max_new_tokens`."
             )

     def _prepare_generated_length(self, generation_config, has_default_max_length, input_ids_length):
         if generation_config.max_new_tokens is not None:
             if not has_default_max_length and generation_config.max_length is not None:
                 logger.warning(
+                    "Both `max_new_tokens` and `max_length` are set. `max_new_tokens` takes precedence."
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
         elif has_default_max_length:
             if generation_config.max_length == DreamGenerationConfig().max_length:
                 generation_config.max_length = generation_config.max_length + input_ids_length
+        mpe = getattr(self.config, "max_position_embeddings", None)
+        if mpe is not None:
+            generation_config.max_length = min(generation_config.max_length, mpe)
         return generation_config

     def _prepare_generation_config(self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict) -> DreamGenerationConfig:
@@ -295,7 +219,7 @@

         return generation_config

+    def _prepare_special_tokens(self, generation_config: DreamGenerationConfig, device=None):
         def _tensor_or_none(token, device=None):
             if token is None:
                 return token

@@ -311,7 +235,6 @@

         if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
             eos_token_tensor = eos_token_tensor.unsqueeze(0)
         if pad_token_tensor is None and eos_token_tensor is not None:
             pad_token_tensor = eos_token_tensor[0]
             logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")

@@ -327,7 +250,7 @@
         inputs: Optional[torch.Tensor] = None,
         generation_config: Optional[DreamGenerationConfig] = None,
         **kwargs,
+    ):
         generation_config = self._prepare_generation_config(generation_config, **kwargs)
         generation_tokens_hook_func = kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)
         generation_logits_hook_func = kwargs.pop("generation_logits_hook_func", lambda step, x, logits: logits)
@@ -350,9 +273,7 @@

         if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
             warnings.warn(
+                "You are calling .generate() with `input_ids` on a different device than the model.",
                 UserWarning,
             )
         if (

@@ -361,8 +282,7 @@
             and attention_mask is None
         ):
             warnings.warn(
+                "Padding detected but no attention mask was passed. Set `attention_mask` for correct generation.",
                 UserWarning,
             )

@@ -372,14 +292,13 @@
             attention_mask=attention_mask,
         )

+        return self._sample(
             input_ids,
             attention_mask=attention_mask,
             generation_config=generation_config,
             generation_tokens_hook_func=generation_tokens_hook_func,
             generation_logits_hook_func=generation_logits_hook_func,
         )

     def _sample(
         self,
@@ -388,7 +307,7 @@
         generation_config: DreamGenerationConfig,
         generation_tokens_hook_func,
         generation_logits_hook_func
+    ):
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length

@@ -401,10 +320,7 @@
         top_p = generation_config.top_p
         top_k = generation_config.top_k

+        rcr = generation_config.rcr  # when enabled, use the paper-version RCR (running max of the selected token's probability)
         histories = [] if (return_dict_in_generate and output_history) else None

         # pad input_ids to max_length
@@ -424,75 +340,60 @@

         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)

         if rcr:
+            # initial generation region (the stretch appended beyond the prompt)
+            init_mask_bool = (x == mask_token_id)        # [B, L]
+            init_mask_count = init_mask_bool.sum(dim=1)  # [B]
+            # running max of the "selected token probability" (float32)
+            rmax_conf = torch.zeros_like(x, dtype=torch.float32, device=x.device)
+            logger.warning(
+                "[RCR] Using PAPER version: running-max of SELECTED-TOKEN PROB; "
+                "this overrides `conf_alg` (e.g., entropy) for remasking decisions."
+            )

         x = generation_tokens_hook_func(None, x, None)

         for i in range(steps):
             mask_index = (x == mask_token_id)
+
+            # forward pass
             logits = self(x, attention_mask, tok_idx).logits
             logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
             logits = generation_logits_hook_func(i, x, logits)

             t = timesteps[i]
             s = timesteps[i + 1]

+            if not rcr:
+                # ===== vanilla path (kept as in the original implementation) =====
+                mask_logits = logits[mask_index]
+                if alg == 'origin':
+                    p_transfer = 1 - s / t if i < steps - 1 else 1
+                    x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
+                    transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
+                    if transfer_index_t_s.any():
+                        logits_sub = mask_logits[transfer_index_t_s]
+                        logits_sub = _apply_top_p_k_temp(logits_sub, temperature, top_p, top_k)
+                        probs_sub = torch.softmax(logits_sub, dim=-1)
+                        try:
+                            x0_sel = dists.Categorical(probs=probs_sub).sample()
+                        except Exception:
+                            x0_sel = probs_sub.argmax(dim=-1)
+                        x0[transfer_index_t_s] = x0_sel
+                    x[mask_index] = x0.clone()
                 else:
+                    # follow the vanilla top-k / alg_temp logic
+                    mask_logits = _apply_top_p_k_temp(logits[mask_index], temperature, top_p, top_k)
+                    probs = torch.softmax(mask_logits, dim=-1)
+                    if temperature and temperature > 0:
+                        try:
+                            x0 = dists.Categorical(probs=probs).sample()
+                            confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
+                        except Exception:
+                            confidence, x0 = probs.max(dim=-1)
+                    else:
+                        confidence, x0 = probs.max(dim=-1)
+
                    avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
                    ratio = (1.0 - (s.item() / t.item())) if i < steps - 1 else 1.0
                    number_transfer_tokens = int(avg_mask_now * ratio)

@@ -512,6 +413,48 @@
                    row_indices = torch.arange(x.size(0), device=self.device).unsqueeze(1).expand_as(transfer_index)
                    x[row_indices, transfer_index] = x_[row_indices, transfer_index]

+
else:
|
| 417 |
+
# ===== 论文版 RCR =====
|
| 418 |
+
# 1) 仅对当前 mask 的位置,做 top_p/top_k/temperature 过滤后采样(或贪心)
|
| 419 |
+
mask_logits = logits[mask_index]
|
| 420 |
+
mask_logits = _apply_top_p_k_temp(mask_logits, temperature, top_p, top_k)
|
| 421 |
+
probs = torch.softmax(mask_logits, dim=-1)
|
| 422 |
+
|
| 423 |
+
# 采样 / 贪心
|
| 424 |
+
if temperature and temperature > 0:
|
| 425 |
+
try:
|
| 426 |
+
x0 = dists.Categorical(probs=probs).sample()
|
| 427 |
+
except Exception:
|
| 428 |
+
x0 = probs.argmax(dim=-1)
|
| 429 |
+
else:
|
| 430 |
+
x0 = probs.argmax(dim=-1)
|
| 431 |
+
|
| 432 |
+
# 被选 token 的概率 p_sel(论文要求用这个做“历史置信度”)
|
| 433 |
+
p_sel = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1) # [M], float32
|
| 434 |
+
|
| 435 |
+
# 写入选中的 token
|
| 436 |
+
x_maskwrite = torch.full_like(x, mask_token_id, dtype=torch.long)
|
| 437 |
+
x_maskwrite[mask_index] = x0
|
| 438 |
+
x = torch.where(mask_index, x_maskwrite, x)
|
| 439 |
+
|
| 440 |
+
# 更新 running-max 置信度(float32)
|
| 441 |
+
# 先铺到全长
|
| 442 |
+
full_p_sel = torch.zeros_like(x, dtype=torch.float32)
|
| 443 |
+
full_p_sel[mask_index] = p_sel.to(torch.float32)
|
| 444 |
+
rmax_conf = torch.maximum(rmax_conf, full_p_sel)
|
| 445 |
+
|
| 446 |
+
# 2) 基于 rmax_conf 直接确定“下一步要保留的已确认个数”,其余全部回遮
|
| 447 |
+
self._apply_rcr_logic_paper(
|
| 448 |
+
x=x,
|
| 449 |
+
rmax_conf=rmax_conf,
|
| 450 |
+
init_mask_bool=init_mask_bool,
|
| 451 |
+
init_mask_count=init_mask_count,
|
| 452 |
+
mask_token_id=mask_token_id,
|
| 453 |
+
step=i,
|
| 454 |
+
total_steps=steps,
|
| 455 |
+
s=s, t=t,
|
| 456 |
+
)
|
| 457 |
+
|
| 458 |
x = generation_tokens_hook_func(i, x, logits)
|
| 459 |
if histories is not None:
|
| 460 |
histories.append(x.clone())
|
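A small self-contained illustration of the running-max update the RCR branch relies on (toy tensors, not from the diff): the buffer keeps, per position, the highest probability ever assigned to the token selected there across steps.

# Toy illustration of the rmax_conf update.
import torch

B, L = 1, 6
rmax_conf = torch.zeros(B, L)

# step 1: positions 2 and 4 were masked and got filled with probs 0.40 and 0.90
mask_index = torch.tensor([[False, False, True, False, True, False]])
full_p_sel = torch.zeros(B, L)
full_p_sel[mask_index] = torch.tensor([0.40, 0.90])
rmax_conf = torch.maximum(rmax_conf, full_p_sel)

# step 2: position 2 was re-masked and re-filled, this time with prob 0.70
mask_index = torch.tensor([[False, False, True, False, False, False]])
full_p_sel = torch.zeros(B, L)
full_p_sel[mask_index] = torch.tensor([0.70])
rmax_conf = torch.maximum(rmax_conf, full_p_sel)

print(rmax_conf)  # position 2 now holds 0.70, position 4 keeps 0.90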