Resolve - 196 [rank0]: triton.runtime.autotuner.OutOfResources: out of resource: shared memory, Required: 180224, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.
#33 by moidhassan - opened
positional_embedding.py
CHANGED
@@ -269,10 +269,10 @@ class RotaryEmbedding(torch.nn.Module):
         return (
             apply_rotary_pos_emb(
                 q, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
-            ),
+            ).to(q.dtype),
             apply_rotary_pos_emb(
                 k, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
-            ),
+            ).to(q.dtype),
         )
 
     @classmethod
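The `.to(q.dtype)` cast guards against an implicit dtype promotion: when the `cos_cached`/`sin_cached` buffers are kept in float32 while `q`/`k` are fp16 or bf16, the rotary output comes back in float32, which downstream kernels expecting the query dtype can reject. A minimal sketch of that promotion and the cast, using a simplified stand-in for `apply_rotary_pos_emb` (illustrative only, not the repo's implementation):

import torch

def apply_rotary_pos_emb(x, cos, sin):
    # Simplified rotary application: rotate halves and combine with cos/sin.
    # A float16 x multiplied by float32 cos/sin is promoted to float32.
    x1, x2 = x.chunk(2, dim=-1)
    rotated = torch.cat((-x2, x1), dim=-1)
    return x * cos + rotated * sin

q = torch.randn(2, 4, 8, 64, dtype=torch.float16)  # (batch, seq, heads, head_dim)
cos = torch.randn(4, 1, 64, dtype=torch.float32)   # fp32 cache, broadcast over heads
sin = torch.randn(4, 1, 64, dtype=torch.float32)

out = apply_rotary_pos_emb(q, cos, sin)
print(out.dtype)              # torch.float32 -- dtype drifted away from q
print(out.to(q.dtype).dtype)  # torch.float16 -- the cast in the patch restores it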
triton_flash_blocksparse_attn.py
CHANGED
@@ -1020,7 +1020,7 @@ def blocksparse_flash_attn_padded_fwd(
         BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
         EVEN_D = block_d == head_size,
         num_warps = 1 if q_len == 1 else 4,
-        num_stages = 3
+        num_stages = 1  # <---- instead of 3
     )
 
     return out
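In Triton, `num_stages` controls the depth of software pipelining: each additional stage buffers more operand tiles in shared memory, so reducing it shrinks the kernel's shared-memory footprint at some cost in latency hiding. That is why dropping from 3 stages to 1 brings the ~180 KB requirement in the error message back under the 101376-byte hardware limit. A minimal, self-contained sketch of how the knob is passed at launch time (a toy elementwise kernel, not the blocksparse attention kernel patched above):

import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

x = torch.randn(4096, device="cuda")
y = torch.randn(4096, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 1024),)

# num_warps / num_stages are launch-time meta-parameters, just like in the
# patched call above; num_stages=1 trades pipelining for less shared memory.
add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024, num_warps=4, num_stages=1)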