Update moondream.py
fix: change pad token

moondream.py  +17 -14
@@ -945,7 +945,7 @@ class MoondreamModel(nn.Module):
 
     def _prefill_prompt_batched(
         self,
-        labels
+        labels,
         pos: int,
         lora=None,
         temperature: float = 0.0,
@@ -955,7 +955,7 @@ class MoondreamModel(nn.Module):
         if tpl is None:
             raise NotImplementedError("Model does not support object detection.")
 
-        # 1)
+        # 1) Tokenize each label (variable lengths Li)
         rows_ids, lens = [], []
         for lab in labels:
             ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
@@ -966,7 +966,7 @@ class MoondreamModel(nn.Module):
         B = len(rows_ids)
         T = max(lens)
 
-        # 2) Embed
+        # 2) Embed and LEFT-pad each row with its first token embedding
         embs = [text_encoder(t.unsqueeze(0), self.text)[0] for t in rows_ids]  # list[(Li, C)]
         padded = []
         for e, L in zip(embs, lens):
@@ -977,30 +977,33 @@ class MoondreamModel(nn.Module):
         prompt_emb = torch.stack(padded, dim=0)  # (B, T, C)
         torch._dynamo.mark_dynamic(prompt_emb, 1)
 
-        # 3) Prefill over the shared image prefix [pos : pos
-        base = self.attn_mask[:, :, pos:pos + T, :]
-        mask = base.expand(B, -1, -1, -1).contiguous()
+        # 3) Prefill over the shared image prefix [pos : pos+T)
+        base = self.attn_mask[:, :, pos : pos + T, :]  # (1,1,T,K)
+        mask = base.expand(B, -1, -1, -1).contiguous()  # (B,1,T,K)
         pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)
-        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B, T, C)
-        logits_BTV = lm_head(hidden_BTC, self.text)  # (B, T, V)
 
-
-
-
-
+        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B,T,C)
+        logits_BTV = lm_head(hidden_BTC, self.text)  # (B,T,V)
+
+        # *** IMPORTANT: take the tail position for every row ***
+        last_idx = torch.full((B,), T - 1, device=self.device, dtype=torch.long)
+
+        last_hidden = hidden_BTC[torch.arange(B, device=self.device), last_idx][:, None, :]  # (B,1,C)
+        last_logits = logits_BTV[torch.arange(B, device=self.device), last_idx]  # (B,V)
 
         if temperature == 0.0:
-            next_token = last_logits.argmax(dim=-1, keepdim=True)  # (B,
+            next_token = last_logits.argmax(dim=-1, keepdim=True)  # (B,1)
         else:
             probs = torch.softmax(last_logits / temperature, dim=-1)
             probs = self._apply_top_p(probs, top_p)
-            next_token = torch.multinomial(probs, num_samples=1)  # (B,
+            next_token = torch.multinomial(probs, num_samples=1)  # (B,1)
 
         pos_end = int(pos + T)
         return last_hidden, next_token, pos_end
 
 
 
+
     def _generate_points_batched(
         self,
         hidden,  # (B,1,C) - last token hidden state per row