lukeingawesome committed
Commit 6f83848 · verified · 1 Parent(s): dd5ed8b

Fix model structure to match checkpoint (wrap LlamaBiModel in self.model)

Files changed (1)
  1. modeling_llm2vec4cxr.py +18 -20
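Why the restructuring matters: the previous class subclassed LlamaBiModel directly, so the backbone's parameters sat at the top level of the state dict, while (per the commit message and the new docstring) the saved checkpoint expects them under a "model." prefix next to "latent_attn.*". The following is a minimal stand-in sketch of that key mismatch, assuming the checkpoint layout described in the commit; Inner, OldStyle, and NewStyle are illustrative placeholders, not the real LlamaBiModel or LatentAttentionPooling.

# Stand-in illustration (not the real classes): why wrapping the backbone in
# `self.model` makes parameter names line up with the saved checkpoint.
import torch.nn as nn

class Inner(nn.Module):                      # stand-in for LlamaBiModel
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(10, 4)

class OldStyle(Inner):                       # old layout: subclass of the backbone
    def __init__(self):
        super().__init__()
        self.latent_attn = nn.Linear(4, 4)   # stand-in for LatentAttentionPooling

class NewStyle(nn.Module):                   # new layout: backbone wrapped in self.model
    def __init__(self):
        super().__init__()
        self.model = Inner()
        self.latent_attn = nn.Linear(4, 4)

print(sorted(OldStyle().state_dict()))  # embed_tokens.weight, latent_attn.*  (no "model." prefix)
print(sorted(NewStyle().state_dict()))  # model.embed_tokens.weight, latent_attn.*  (matches the checkpoint layout)

With the prefixes aligned, from_pretrained should load the checkpoint without remapping keys or warning about unused latent attention weights.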
modeling_llm2vec4cxr.py CHANGED
@@ -3,6 +3,8 @@ Custom model class for LLM2Vec4CXR that properly handles latent attention poolin
 """
 
 from llm2vec.models.bidirectional_llama import LlamaBiModel
+from transformers import PreTrainedModel
+from transformers.models.llama.configuration_llama import LlamaConfig
 # from llm2vec.pooling import LatentAttentionPooling
 from .pooling_latent import LatentAttentionPooling
 from transformers import AutoTokenizer
@@ -11,46 +13,42 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 
-class LLM2Vec4CXRModel(LlamaBiModel):
+class LLM2Vec4CXRModel(PreTrainedModel):
     """
-    Custom LlamaBiModel that includes latent attention pooling by default.
-    This prevents the warning about unused latent attention weights.
+    Wrapper model that includes LlamaBiModel and latent attention pooling.
+    Structure matches the saved checkpoint: self.model + self.latent_attn
     """
+    config_class = LlamaConfig
 
     def __init__(self, config, **kwargs):
         super().__init__(config, **kwargs)
 
+        # Wrap the LlamaBiModel
+        self.model = LlamaBiModel(config)
+
         # Initialize latent attention pooling
         self.latent_attn = LatentAttentionPooling(
            d_model=config.hidden_size,
            num_heads=8,     # Standard for this model size
            num_latents=512  # Standard for LLM2Vec
        )
-
-        # Move to the same device/dtype as the base model
-        if hasattr(self, 'model') and hasattr(self.model, 'embed_tokens'):
-            device = self.model.embed_tokens.weight.device
-            dtype = self.model.embed_tokens.weight.dtype
-            self.latent_attn = self.latent_attn.to(device=device, dtype=dtype)
 
     def forward(self, input_ids, attention_mask=None, embed_mask=None, **kwargs):
        """
        Forward pass that properly handles latent attention pooling.
        """
        # Get base model output
-        outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
+        outputs = self.model(input_ids, attention_mask=attention_mask, **kwargs)
 
-        # If we have latent attention pooling, apply it
-        if hasattr(self, 'latent_attn') and self.latent_attn is not None:
-            if embed_mask is not None:
-                # Use embed_mask for instruction-following tasks
-                pooled_output = self.latent_attn(outputs.last_hidden_state, embed_mask)
-            else:
-                # Use attention_mask for simple encoding
-                pooled_output = self.latent_attn(outputs.last_hidden_state, attention_mask)
-            return pooled_output
+        # Apply latent attention pooling
+        if embed_mask is not None:
+            # Use embed_mask for instruction-following tasks
+            pooled_output = self.latent_attn(outputs.last_hidden_state, embed_mask)
+        else:
+            # Use attention_mask for simple encoding
+            pooled_output = self.latent_attn(outputs.last_hidden_state, attention_mask)
 
-        return outputs.last_hidden_state
+        return pooled_output
 
     # --- Convenience tokenizer (lazy) -------------------------------------
     def _get_tokenizer(self):
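For reference, a hedged usage sketch of the restructured forward path. The repo id is a placeholder, and loading through AutoModel with trust_remote_code assumes the repository's auto_map registers LLM2Vec4CXRModel; per the diff, calling the model with only attention_mask takes the simple-encoding branch, while passing an embed_mask (marking the span of tokens to embed) takes the instruction-following branch, and in both cases the forward now returns the pooled output rather than last_hidden_state.

# Hypothetical usage sketch; "user/llm2vec4cxr" is a placeholder repo id, and
# AutoModel + trust_remote_code assumes the repo's auto_map points at LLM2Vec4CXRModel.
import torch
from transformers import AutoModel, AutoTokenizer

repo_id = "user/llm2vec4cxr"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).eval()

batch = tokenizer("There is a small left pleural effusion.", return_tensors="pt")

with torch.no_grad():
    # No embed_mask -> the "simple encoding" branch pools over attention_mask.
    emb = model(batch["input_ids"], attention_mask=batch["attention_mask"])

print(emb.shape)  # pooled embedding produced by the latent-attention pooler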