HV-Khurdula committed on
Commit 5f6e97e · verified · 1 Parent(s): 13973a3

Update moondream.py


fix: per-row generation for granularity.

Files changed (1)
  1. moondream.py (+56 -55)
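The core of the change is in the prefill step: rows were previously left-padded and every row's hidden state was read at the shared tail index T - 1, while the new code right-pads each row with its last real embedding and gathers at each row's own index Li - 1. Below is a minimal, standalone sketch of that gather pattern (toy sizes; the names lens, embs, prompt_emb are illustrative stand-ins, not the model's API), followed by the actual diff.

# Toy sketch only: shows the right-pad + per-row gather that the prefill change
# relies on; the tensors here are made up and no Moondream code is involved.
import torch

torch.manual_seed(0)
C = 4                                     # embedding width (made up)
lens = [3, 5, 2]                          # per-row prompt lengths Li
T = max(lens)
embs = [torch.randn(L, C) for L in lens]  # stand-ins for text_encoder outputs

padded = []
for e, L in zip(embs, lens):
    pad = T - L
    if pad > 0:
        e = torch.cat([e, e[-1:].repeat(pad, 1)], dim=0)  # right-pad with last embedding
    padded.append(e)
prompt_emb = torch.stack(padded, dim=0)                   # (B, T, C)

B = len(lens)
last_idx = torch.tensor([L - 1 for L in lens])            # each row's true last token
last_emb = prompt_emb[torch.arange(B), last_idx]          # (B, C)

# The gather lands on each row's real last embedding, not on a pad slot.
for row, L in enumerate(lens):
    assert torch.equal(last_emb[row], embs[row][L - 1])
print(last_idx.tolist())                                  # [2, 4, 1]

With right padding, index T - 1 belongs to a pad copy for any row shorter than T, so gathering at Li - 1 is what keeps each label's logits tied to its own prompt rather than to the longest prompt in the batch.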
moondream.py CHANGED
@@ -951,42 +951,50 @@ class MoondreamModel(nn.Module):
         temperature: float = 0.0,
         top_p: float = 0.0,
     ):
+        """
+        Batch prefill for multiple detection labels.
+
+        - Right-pads each row with its *last* embedding so the true last token for
+          each row is still at index (len-1). We then take that per-row index.
+        - Advances KV to a common end position (pos + T) for all rows.
+        """
         tpl = self.config.tokenizer.templates["detect"]
         if tpl is None:
             raise NotImplementedError("Model does not support object detection.")

-        # 1) Tokenize each label (variable lengths Li)
+        # Tokenize rows (variable lengths Li)
         rows_ids, lens = [], []
         for lab in labels:
             ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
             t = torch.tensor(ids, device=self.device, dtype=torch.long)
             rows_ids.append(t)
-            lens.append(t.numel())
+            lens.append(int(t.numel()))

         B = len(rows_ids)
         T = max(lens)

-        # 2) Embed and LEFT-pad each row with its first token embedding
-        embs = [text_encoder(t.unsqueeze(0), self.text)[0] for t in rows_ids]  # list[(Li, C)]
+        # Embed, then RIGHT-pad by repeating the last real token embedding
+        embs = [text_encoder(t.unsqueeze(0), self.text)[0] for t in rows_ids]  # (Li, C)
         padded = []
         for e, L in zip(embs, lens):
             pad = T - L
             if pad > 0:
-                e = torch.cat([e[:1].repeat(pad, 1), e], dim=0)  # (T, C)
+                e = torch.cat([e, e[-1:].repeat(pad, 1)], dim=0)  # (T, C)
             padded.append(e)
         prompt_emb = torch.stack(padded, dim=0)  # (B, T, C)
         torch._dynamo.mark_dynamic(prompt_emb, 1)

-        # 3) Prefill over the shared image prefix [pos : pos+T)
+        # Shared mask over the image prefix; broadcast to B
         base = self.attn_mask[:, :, pos : pos + T, :]  # (1,1,T,K)
-        mask = base.expand(B, -1, -1, -1).contiguous()  # (B,1,T,K)
-        pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)
+        attn_mask = base.expand(B, -1, -1, -1).contiguous()  # (B,1,T,K)
+        pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)

-        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B,T,C)
-        logits_BTV = lm_head(hidden_BTC, self.text)  # (B,T,V)
+        # Prefill
+        hidden_BTC = self._prefill(prompt_emb, attn_mask, pos_ids, lora)  # (B,T,C)
+        logits_BTV = lm_head(hidden_BTC, self.text)  # (B,T,V)

-        # *** IMPORTANT: take the tail position for every row ***
-        last_idx = torch.full((B,), T - 1, device=self.device, dtype=torch.long)
+        # For each row, pick its *true* last token (Li-1), not a padded index
+        last_idx = torch.tensor([L - 1 for L in lens], device=self.device, dtype=torch.long)  # (B,)

         last_hidden = hidden_BTC[torch.arange(B, device=self.device), last_idx][:, None, :]  # (B,1,C)
         last_logits = logits_BTV[torch.arange(B, device=self.device), last_idx]  # (B,V)
@@ -998,17 +1006,15 @@ class MoondreamModel(nn.Module):
         probs = self._apply_top_p(probs, top_p)
         next_token = torch.multinomial(probs, num_samples=1)  # (B,1)

+        # We advanced KV for T steps for everyone; decoding starts after that slot.
         pos_end = int(pos + T)
         return last_hidden, next_token, pos_end

-
-
-
     def _generate_points_batched(
         self,
-        hidden,      # (B,1,C) - last token hidden state per row
-        next_token,  # (B,1)   - unused for greedy loop; kept for API
-        pos,         # int     - first free position in cache
+        hidden,      # (B,1,C) last token hidden per row
+        next_token,  # (B,1)
+        pos,         # int: first free KV slot (after prefill)
         include_size: bool = True,
         max_objects: int = 50,
         lora=None,
@@ -1020,15 +1026,14 @@ class MoondreamModel(nn.Module):
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context

-        # 4D mask: (B,1,1,K); we advance per-row
-        mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
-        p0 = int(pos)
-        if p0 > 0:
-            mask[:, :, :, :p0] = True
-        pos_ids = torch.full((B, 1), p0, device=device, dtype=torch.long)
+        # Per-row decoding mask & pos pointer
+        attn = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)  # (B,1,1,K)
+        if pos > 0:
+            attn[:, :, :, :pos] = True
+        pos_ids = torch.full((B, 1), pos, device=device, dtype=torch.long)

-        # helper: logits -> normalized [0..1] coordinate (soft-argmax for stability)
         def _argmax01(logits: torch.Tensor) -> torch.Tensor:
+            # returns normalized [0,1] bin position
             if logits.dim() == 3:
                 logits = logits.squeeze(1)  # (B, bins)
             if use_soft_argmax:
@@ -1043,31 +1048,30 @@ class MoondreamModel(nn.Module):

         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
-                alive_idx = alive.nonzero(as_tuple=False).squeeze(1)
+                idx = alive.nonzero(as_tuple=False).squeeze(1)

-                # ---------- x ----------
-                x_logits = decode_coordinate(hidden, self.region)  # (B,1,bins) or (B,bins)
-                x_center = _argmax01(x_logits)  # (B,)
-                x_emb = encode_coordinate(x_center.to(dtype=x_logits.dtype).unsqueeze(-1), self.region).unsqueeze(1)  # (B,1,C)
+                # ---- x ----
+                x_logits = decode_coordinate(hidden, self.region)
+                x_center = _argmax01(x_logits)
+                x_emb = encode_coordinate(x_center.to(dtype=x_logits.dtype).unsqueeze(-1), self.region).unsqueeze(1)

-                # advance one token for each alive row (per-row column)
-                mask[alive_idx, 0, 0, pos_ids[alive_idx, 0]] = True
-                logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
-                pos_ids[alive_idx, 0] += 1
+                attn[idx, 0, 0, pos_ids[idx, 0]] = True
+                logits, hidden = self._decode_one_tok(x_emb, attn, pos_ids, lora)
+                pos_ids[idx, 0] += 1

-                # ---------- y ----------
+                # ---- y ----
                 y_logits = decode_coordinate(hidden, self.region)
-                y_center = _argmax01(y_logits)  # (B,)
+                y_center = _argmax01(y_logits)
                 y_emb = encode_coordinate(y_center.to(dtype=y_logits.dtype).unsqueeze(-1), self.region).unsqueeze(1)

-                mask[alive_idx, 0, 0, pos_ids[alive_idx, 0]] = True
-                logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
-                pos_ids[alive_idx, 0] += 1
+                attn[idx, 0, 0, pos_ids[idx, 0]] = True
+                logits, hidden = self._decode_one_tok(y_emb, attn, pos_ids, lora)
+                pos_ids[idx, 0] += 1

                 if include_size:
-                    # ---------- size (w,h) ----------
-                    size_ret = decode_size(hidden, self.region)  # (...,2,bins)
-                    w_logits, h_logits = self._norm_size_logits(size_ret, B)  # each (B,bins)
+                    # ---- (w,h) ----
+                    size_ret = decode_size(hidden, self.region)  # (...,2,bins)
+                    w_logits, h_logits = self._norm_size_logits(size_ret, B)

                     if use_soft_argmax:
                         bins = torch.arange(w_logits.size(-1), device=device, dtype=torch.float32)
@@ -1083,8 +1087,7 @@ class MoondreamModel(nn.Module):

                     size_emb = encode_size(torch.stack([w, h], dim=1).to(dtype=w_logits.dtype), self.region).unsqueeze(1)

-                    # write outputs only for alive rows
-                    for i in alive_idx.tolist():
+                    for i in idx.tolist():
                         xl = (x_center[i] - w[i] / 2).item()
                         xr = (x_center[i] + w[i] / 2).item()
                         yt = (y_center[i] - h[i] / 2).item()
@@ -1096,27 +1099,24 @@ class MoondreamModel(nn.Module):
                             "y_max": max(0.0, min(1.0, yb)),
                         })

-                    mask[alive_idx, 0, 0, pos_ids[alive_idx, 0]] = True
-                    logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
-                    pos_ids[alive_idx, 0] += 1
+                    attn[idx, 0, 0, pos_ids[idx, 0]] = True
+                    logits, hidden = self._decode_one_tok(size_emb, attn, pos_ids, lora)
+                    pos_ids[idx, 0] += 1

                     next_tok = logits.argmax(dim=-1)
                     if next_tok.dim() == 3: next_tok = next_tok.squeeze(-1).squeeze(-1)
                     if next_tok.dim() == 2: next_tok = next_tok.squeeze(1)
                 else:
-                    # points only
-                    for i in alive_idx.tolist():
+                    for i in idx.tolist():
                         out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
-                    mask[alive_idx, 0, 0, pos_ids[alive_idx, 0]] = True
-                    logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
-                    pos_ids[alive_idx, 0] += 1
+                    attn[idx, 0, 0, pos_ids[idx, 0]] = True
+                    logits, hidden = self._decode_one_tok(y_emb, attn, pos_ids, lora)
+                    pos_ids[idx, 0] += 1
                     next_tok = logits.argmax(dim=-1)
                     if next_tok.dim() == 3: next_tok = next_tok.squeeze(-1).squeeze(-1)
                     if next_tok.dim() == 2: next_tok = next_tok.squeeze(1)

-                counts[alive] += 1  # we produced one object/point for each alive row
-
-                # stop rows that hit eos OR reached quota
+                counts[alive] += 1
                 finished_now = (next_tok == eos_id) | (counts >= max_objects)
                 alive &= ~finished_now

@@ -1124,6 +1124,7 @@ class MoondreamModel(nn.Module):



+
     def detect_multi(self, image, objects, settings=None):
         if self.config.tokenizer.templates["detect"] is None:
             raise NotImplementedError("Model does not support object detection.")