omar-ah
/

vil-tracker

Model card Files Files and versions

xet

Community

omar-ah committed on 9 days ago

Commit

a4d3af5

verified ·

1 Parent(s): f871a5c

Sequence training: pairs→K-frame clips, mLSTM memory carries across frames

Browse files

Files changed (1) hide show

vil_tracker/models/backbone.py +54 -28

vil_tracker/models/backbone.py CHANGED Viewed

@@ -134,19 +134,27 @@ class mLSTMBlockWithTMoE(nn.Module):
 class ViLBackbone(nn.Module):
-    """Vision-LSTM backbone for tracking with integrated FiLM temporal modulation.
-    Concatenates template + search patches into a single sequence,
-    processes through bidirectional mLSTM blocks with FiLM modulation
-    injected between blocks at regular intervals, then separates outputs.
-    Template: 128x128 → 8x8 = 64 tokens
-    Search:   256x256 → 16x16 = 256 tokens
-    Total sequence: 320 tokens
     Bidirectional scanning: even blocks L→R, odd blocks R→L.
-    Last `tmoe_blocks` blocks use TMoE MLP for temporal specialization.
-    FiLM modulation: applied after every `film_interval`-th block.
     """
     def __init__(
         self,
@@ -213,52 +221,70 @@ class ViLBackbone(nn.Module):
     def forward(
         self,
         template: torch.Tensor,
-        search: torch.Tensor,
         temporal_mod_manager=None,
     ) -> tuple:
         """
         Args:
             template: (B, 3, 128, 128) template image
-            search: (B, 3, 256, 256) search region image
             temporal_mod_manager: optional TemporalModulationManager for FiLM
         Returns:
             template_feat: (B, 64, D) template features
-            search_feat: (B, 256, D) search features
         """
         B = template.shape[0]
-        # Patch embed
-        t_tokens = self.patch_embed(template)  # (B, 64, D)
-        s_tokens = self.patch_embed(search)    # (B, 256, D)
-        # Add positional + type embeddings
         t_tokens = t_tokens + self.template_pos + self.template_type
-        s_tokens = s_tokens + self.search_pos + self.search_type
-        # Concatenate: [template | search]
-        tokens = torch.cat([t_tokens, s_tokens], dim=1)  # (B, 320, D)
-        n_template = t_tokens.shape[1]
-        # Process through bidirectional mLSTM blocks with optional FiLM
         for i, block in enumerate(self.blocks):
-            reverse = (i % 2 == 1)  # odd blocks: R→L
             tokens = block(tokens, reverse=reverse)
-            # Apply FiLM temporal modulation between blocks
             if temporal_mod_manager is not None:
                 tokens = temporal_mod_manager.modulate(tokens, i)
         tokens = self.norm(tokens)
-        # Update temporal context after full forward pass
         if temporal_mod_manager is not None:
             temporal_mod_manager.update_temporal_context(tokens)
-        # Split back
-        template_feat = tokens[:, :n_template]
-        search_feat = tokens[:, n_template:]
-        return template_feat, search_feat
     def freeze_shared_experts(self):
         """Freeze shared experts in TMoE blocks for Phase 2 training."""

 class ViLBackbone(nn.Module):
+    """Vision-LSTM backbone for tracking with sequential multi-frame processing.
+    Processes template + K search frames as one long mLSTM sequence:
+        [template_tokens | search_1_tokens | search_2_tokens | ... | search_K_tokens]
+    The mLSTM memory state C carries information across frames:
+    - Template tokens establish the target appearance in memory
+    - Search_1 tokens are processed with template context in memory
+    - Search_2 tokens are processed with template + search_1 context, etc.
+    This is the core advantage over ViT: temporal information accumulates
+    in the recurrent memory state, not through attention over all tokens.
+    Token counts:
+        Template: 128x128 → 8x8 = 64 tokens
+        Each search: 256x256 → 16x16 = 256 tokens
+        K=3 sequence: 64 + 3×256 = 832 tokens
     Bidirectional scanning: even blocks L→R, odd blocks R→L.
+    FiLM modulation: applied between blocks at interval=6.
+    TMoE: last `tmoe_blocks` blocks.
     """
     def __init__(
         self,
     def forward(
         self,
         template: torch.Tensor,
+        searches: torch.Tensor,
         temporal_mod_manager=None,
     ) -> tuple:
         """
+        Process template + K search frames as one mLSTM sequence.
         Args:
             template: (B, 3, 128, 128) template image
+            searches: (B, K, 3, 256, 256) K consecutive search frames
+                      OR (B, 3, 256, 256) single search frame (backward compat)
             temporal_mod_manager: optional TemporalModulationManager for FiLM
         Returns:
             template_feat: (B, 64, D) template features
+            search_feats: (B, K, 256, D) per-frame search features
+                          OR (B, 256, D) if single search frame input
         """
         B = template.shape[0]
+        single_frame = (searches.ndim == 4)  # (B, 3, H, W) vs (B, K, 3, H, W)
+        if single_frame:
+            searches = searches.unsqueeze(1)  # (B, 1, 3, H, W)
+        K = searches.shape[1]
+        # Patch embed template
+        t_tokens = self.patch_embed(template)  # (B, 64, D)
         t_tokens = t_tokens + self.template_pos + self.template_type
+        n_template = t_tokens.shape[1]  # 64
+        # Patch embed all search frames
+        # Reshape (B, K, 3, H, W) → (B*K, 3, H, W) for batch patch embedding
+        s_flat = searches.reshape(B * K, *searches.shape[2:])
+        s_tokens_flat = self.patch_embed(s_flat)  # (B*K, 256, D)
+        s_tokens = s_tokens_flat.reshape(B, K, -1, self.dim)  # (B, K, 256, D)
+        s_tokens = s_tokens + self.search_pos.unsqueeze(1) + self.search_type
+        n_search = s_tokens.shape[2]  # 256
+        # Build full sequence: [template | search_1 | search_2 | ... | search_K]
+        # The mLSTM memory carries information across this entire sequence
+        s_tokens_concat = s_tokens.reshape(B, K * n_search, self.dim)  # (B, K*256, D)
+        tokens = torch.cat([t_tokens, s_tokens_concat], dim=1)  # (B, 64 + K*256, D)
+        # Process through bidirectional mLSTM blocks
         for i, block in enumerate(self.blocks):
+            reverse = (i % 2 == 1)
             tokens = block(tokens, reverse=reverse)
             if temporal_mod_manager is not None:
                 tokens = temporal_mod_manager.modulate(tokens, i)
         tokens = self.norm(tokens)
         if temporal_mod_manager is not None:
             temporal_mod_manager.update_temporal_context(tokens)
+        # Split: template features + per-frame search features
+        template_feat = tokens[:, :n_template]  # (B, 64, D)
+        search_tokens = tokens[:, n_template:]  # (B, K*256, D)
+        search_feats = search_tokens.reshape(B, K, n_search, self.dim)  # (B, K, 256, D)
+        if single_frame:
+            return template_feat, search_feats.squeeze(1)  # (B, 256, D)
+        return template_feat, search_feats
     def freeze_shared_experts(self):
         """Freeze shared experts in TMoE blocks for Phase 2 training."""