winglian committed on
Commit
871ab2b
1 Parent(s): ca4e687

Fix bias logic to enable QLoRA finetuning


When using the QLoRA technique, `dt_proj` does not have a `bias` attribute, which results in an `AttributeError`. This change enables QLoRA finetuning, with an approximate train loss of ~1.

Files changed (1)
  1. modeling_jamba.py +10 -4
modeling_jamba.py CHANGED
@@ -943,10 +943,16 @@ class JambaMambaMixer(nn.Module):
         # in order to make quantization work. Quantization code replaces `torch.nn.Linear` layers with quantized
         # linear layers, and requires to call the forward pass directly.
         # The original code here was: ```discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)```
-        dt_proj_bias = self.dt_proj.bias
-        self.dt_proj.bias = None
-        discrete_time_step = self.dt_proj(time_step).transpose(1, 2)
-        self.dt_proj.bias = dt_proj_bias
+        if hasattr(self.dt_proj, "bias"):
+            dt_proj_bias = self.dt_proj.bias
+            self.dt_proj.bias = None
+            discrete_time_step = self.dt_proj(time_step).transpose(1, 2)
+            self.dt_proj.bias = dt_proj_bias
+        else:
+            dt_proj_bias = self.dt_proj.base_layer.bias
+            self.dt_proj.base_layer.bias = None
+            discrete_time_step = self.dt_proj(time_step).transpose(1, 2)
+            self.dt_proj.base_layer.bias = dt_proj_bias
 
         A = -torch.exp(self.A_log.float())
         # 3.c perform the recurrence y ← SSM(A, B, C)(x)
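
For context, here is a minimal, self-contained sketch of the dispatch pattern the patch introduces. `QuantWrapper` is a hypothetical stand-in for the adapter wrapper that QLoRA training places around `dt_proj` (the real wrapper comes from PEFT/bitsandbytes; only the `base_layer` attribute is mirrored here), and `project_without_bias` is an illustrative helper, not code from `modeling_jamba.py`.

```python
import torch
import torch.nn as nn


class QuantWrapper(nn.Module):
    """Hypothetical stand-in for a PEFT-style adapter wrapper: the original
    linear layer is kept under `base_layer`, and the wrapper itself exposes
    no `bias` attribute, so `wrapper.bias` raises AttributeError."""

    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.base_layer = linear

    def forward(self, x):
        return self.base_layer(x)


def project_without_bias(dt_proj: nn.Module, time_step: torch.Tensor) -> torch.Tensor:
    """Apply dt_proj's weight (but not its bias), mirroring the patched logic."""
    if hasattr(dt_proj, "bias"):
        # Plain (or quantized) linear layer: temporarily null the bias,
        # run the forward pass, then restore it.
        dt_proj_bias = dt_proj.bias
        dt_proj.bias = None
        out = dt_proj(time_step).transpose(1, 2)
        dt_proj.bias = dt_proj_bias
    else:
        # Adapter wrapper: the bias lives on the wrapped base_layer instead.
        dt_proj_bias = dt_proj.base_layer.bias
        dt_proj.base_layer.bias = None
        out = dt_proj(time_step).transpose(1, 2)
        dt_proj.base_layer.bias = dt_proj_bias
    return out


time_step = torch.randn(2, 8, 16)  # (batch, seq_len, time_step_rank)
plain = nn.Linear(16, 32)
wrapped = QuantWrapper(nn.Linear(16, 32))
assert project_without_bias(plain, time_step).shape == (2, 32, 8)
assert project_without_bias(wrapped, time_step).shape == (2, 32, 8)
```

The `hasattr` check keeps the non-quantized path unchanged, and only falls back to `base_layer` when the wrapper hides the bias, which is exactly the case the `AttributeError` came from.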