Lekr0 committed 22 days ago

Commit

e686d7b

verified ·

1 Parent(s): d1770f6

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/__grp__triton_red_fused_mul_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.cubin +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.llir +179 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.source +230 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.ttir +163 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.llir +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ptx +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.source +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttgir +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttir +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir +936 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir +780 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir +89 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx +357 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source +130 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir +105 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir +104 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/__grp__triton_red_fused_mul_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.cubin +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.llir +140 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ptx +396 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.source +218 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ttgir +158 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ttir +155 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir +896 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/__grp__triton_poi_fused_mul_1.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.cubin +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.llir +58 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ptx +221 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.source +51 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ttgir +42 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ttir +41 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/__grp__triton_red_fused_mul_0.json +1 -0
progress/github/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.cubin +0 -0

progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/__grp__triton_red_fused_mul_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"child_paths": {"triton_red_fused_mul_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.source", "triton_red_fused_mul_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.ttir", "triton_red_fused_mul_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.ttgir", "triton_red_fused_mul_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.llir", "triton_red_fused_mul_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.ptx", "triton_red_fused_mul_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.cubin", "triton_red_fused_mul_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.json"}}

progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.cubin ADDED Viewed

Binary file (20.6 kB). View file

progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.llir ADDED Viewed

	@@ -0,0 +1,179 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+; Function Attrs: nounwind -- kernel summary: per row, accumulate the products %0[...] * %1[...] over 128 columns (16 steps of 8), then store row_sum - %2[row] * c1 * c2 into %3[row]
+define ptx_kernel void @triton_red_fused_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { ; %0, %1: bfloat inputs; %2: float input; %3: float output; %4..%7 are never referenced in the body
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 ; block (CTA) index along x
+  %10 = shl i32 %9, 6, !dbg !8 ; block index * 64 = first row handled by this block
+  %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 ; thread index within the block
+  %12 = and i32 %11, 126, !dbg !9 ; drop tid bit 0
+  %13 = lshr exact i32 %12, 1, !dbg !9 ; row within the block (0..63); the two threads differing only in tid bit 0 share a row
+  %14 = or disjoint i32 %13, %10, !dbg !10 ; global row index used inside the loop
+  %15 = icmp slt i32 %14, 47616, !dbg !11 ; row-in-range predicate (row < 47616)
+  %16 = shl nuw nsw i32 %11, 2, !dbg !12
+  %17 = and i32 %16, 4, !dbg !12 ; 4 * (tid & 1): column offset 0 or 4 inside each 8-wide step
+  %18 = sdiv i32 %14, 1488, !dbg !13 ; row / 1488
+  %19 = shl i32 %14, 7, !dbg !14 ; row * 128: element offset of this row in %1
+  %20 = shl i32 %14, 12
+  %21 = mul i32 %18, -6094720
+  %22 = add i32 %21, %20 ; row*4096 - 6094720*(row/1488), which equals 4096*(row % 1488) + 128*(row/1488): element offset into %0
+  %23 = zext nneg i32 %17 to i64, !dbg !15
+  %24 = sext i32 %19 to i64, !dbg !15
+  %invariant.gep = getelementptr bfloat, ptr addrspace(1) %1, i64 %24, !dbg !15 ; &%1[row * 128], computed once before the loop
+  br label %25, !dbg !15
+25:                                               ; preds = %8, %25
+  %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %25 ] ; column base: 0, 8, ..., 120
+  %26 = phi float [ 0.000000e+00, %8 ], [ %71, %25 ] ; %26..%29: four running partial sums, one per column slot owned by this thread
+  %27 = phi float [ 0.000000e+00, %8 ], [ %72, %25 ]
+  %28 = phi float [ 0.000000e+00, %8 ], [ %73, %25 ]
+  %29 = phi float [ 0.000000e+00, %8 ], [ %74, %25 ]
+  %30 = or disjoint i64 %indvars.iv, %23, !dbg !16 ; first of the 4 consecutive columns this thread reads in this step
+  %31 = trunc nuw nsw i64 %30 to i32, !dbg !17
+  %32 = add i32 %22, %31, !dbg !17
+  %33 = sext i32 %32 to i64, !dbg !18
+  %34 = getelementptr bfloat, ptr addrspace(1) %0, i64 %33, !dbg !18 ; load address in %0
+  %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %36 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 %15) #4, !dbg !19
+  %37 = extractvalue { i32, i32 } %36, 0, !dbg !19 ; the asm above is a 2 x 32-bit load predicated on %15; both outputs are pre-set to 0, so a false predicate yields zeros
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !19 ; each 32-bit word carries two bfloat values
+  %39 = extractvalue { i32, i32 } %36, 1, !dbg !19
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !19
+  %41 = extractelement <2 x bfloat> %38, i64 0, !dbg !19
+  %42 = extractelement <2 x bfloat> %38, i64 1, !dbg !19
+  %43 = extractelement <2 x bfloat> %40, i64 0, !dbg !19
+  %44 = extractelement <2 x bfloat> %40, i64 1, !dbg !19
+  %45 = fpext bfloat %41 to float, !dbg !20 ; widen the four %0 values to float
+  %46 = fpext bfloat %42 to float, !dbg !20
+  %47 = fpext bfloat %43 to float, !dbg !20
+  %48 = fpext bfloat %44 to float, !dbg !20
+  %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %30, !dbg !21 ; load address in %1: row * 128 + column
+  %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !22
+  %50 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %gep, i64 %49, i1 %15) #4, !dbg !22
+  %51 = extractvalue { i32, i32 } %50, 0, !dbg !22 ; same predicated 4 x bfloat load pattern as for %0, here reading from %1
+  %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !22
+  %53 = extractvalue { i32, i32 } %50, 1, !dbg !22
+  %54 = bitcast i32 %53 to <2 x bfloat>, !dbg !22
+  %55 = extractelement <2 x bfloat> %52, i64 0, !dbg !22
+  %56 = extractelement <2 x bfloat> %52, i64 1, !dbg !22
+  %57 = extractelement <2 x bfloat> %54, i64 0, !dbg !22
+  %58 = extractelement <2 x bfloat> %54, i64 1, !dbg !22
+  %59 = fpext bfloat %55 to float, !dbg !23 ; widen the four %1 values to float
+  %60 = fpext bfloat %56 to float, !dbg !23
+  %61 = fpext bfloat %57 to float, !dbg !23
+  %62 = fpext bfloat %58 to float, !dbg !23
+  %63 = fmul float %45, %59, !dbg !24 ; elementwise products
+  %64 = fmul float %46, %60, !dbg !24
+  %65 = fmul float %47, %61, !dbg !24
+  %66 = fmul float %48, %62, !dbg !24
+  %67 = fadd float %26, %63, !dbg !25 ; add the products to the running partial sums
+  %68 = fadd float %27, %64, !dbg !25
+  %69 = fadd float %28, %65, !dbg !25
+  %70 = fadd float %29, %66, !dbg !25
+  %71 = select i1 %15, float %67, float %26, !dbg !26 ; out-of-range rows keep their previous partial sums
+  %72 = select i1 %15, float %68, float %27, !dbg !26
+  %73 = select i1 %15, float %69, float %28, !dbg !26
+  %74 = select i1 %15, float %70, float %29, !dbg !26
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !15
+  %75 = icmp samesign ult i64 %indvars.iv, 120, !dbg !15 ; tests the pre-increment base, so the body runs for bases 0..120 (16 iterations)
+  br i1 %75, label %25, label %76, !dbg !15
+76:                                               ; preds = %25
+  %77 = and i32 %11, 63, !dbg !9 ; epilogue indexing: one row per thread, row-in-block = tid & 63
+  %78 = or disjoint i32 %10, %77, !dbg !10 ; global row index for the epilogue load/store
+  %79 = icmp slt i32 %78, 47616, !dbg !11 ; row-in-range predicate for the epilogue
+  %80 = fadd float %71, %72, !dbg !27 ; fold this thread's four partial sums into one value
+  %81 = fadd float %73, %80, !dbg !27
+  %82 = fadd float %74, %81, !dbg !27
+  %83 = bitcast float %82 to i32, !dbg !31
+  %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 1, i32 31), !dbg !31 ; butterfly shuffle with lane mask 1: fetch the value held by the partner thread of the same row
+  %85 = bitcast i32 %84 to float, !dbg !31
+  %86 = fadd float %82, %85, !dbg !27 ; complete sum over the 128 columns of the row
+  %87 = shl nuw nsw i32 %12, 1, !dbg !32 ; byte offset (tid & 126) * 2 = row-in-block * 4
+  %88 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %87, !dbg !32
+  store float %86, ptr addrspace(3) %88, align 4, !dbg !32 ; publish the row sum in shared memory (both threads of a pair write the same slot)
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32 ; barrier between the shared-memory store above and the load below
+  %89 = shl nuw nsw i32 %77, 2, !dbg !32 ; byte offset (tid & 63) * 4
+  %90 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %89, !dbg !32
+  %91 = load float, ptr addrspace(3) %90, align 4, !dbg !32 ; row sum for row-in-block (tid & 63)
+  %92 = sext i32 %78 to i64, !dbg !33
+  %93 = getelementptr float, ptr addrspace(1) %2, i64 %92, !dbg !33 ; &%2[row]
+  %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !34
+  %95 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %93, i64 %94, i1 %79) #4, !dbg !34
+  %96 = bitcast i32 %95 to float, !dbg !34 ; %2[row], or 0 when the predicate %79 is false
+  %97 = fmul float %96, 0x3FE62E4300000000, !dbg !35 ; first scale factor (printed as 0.693147182 in the Triton IR dump of this kernel)
+  %98 = fmul float %97, 0x3FF7154760000000, !dbg !36 ; second scale factor (printed as 1.44269502 in the Triton IR dump of this kernel)
+  %99 = fsub float %91, %98, !dbg !32 ; result = row_sum - scaled %2[row]
+  %100 = getelementptr float, ptr addrspace(1) %3, i64 %92, !dbg !37 ; &%3[row]
+  %101 = and i32 %11, 64, !dbg !38
+  %102 = icmp eq i32 %101, 0, !dbg !38 ; true only when tid bit 6 is clear, so each row is written by a single thread
+  %103 = bitcast float %99 to i32, !dbg !38
+  %104 = and i1 %102, %79, !dbg !38 ; store predicate: tid bit 6 clear AND row in range
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %103, ptr addrspace(1) %100, i1 %104) #4, !dbg !38
+  ret void, !dbg !39
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused_mul_0", linkageName: "triton_red_fused_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 25, column: 21, scope: !4)
+!12 = !DILocation(line: 26, column: 37, scope: !4)
+!13 = !DILocation(line: 29, column: 19, scope: !4)
+!14 = !DILocation(line: 39, column: 45, scope: !4)
+!15 = !DILocation(line: 32, column: 40, scope: !4)
+!16 = !DILocation(line: 33, column: 31, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 60, scope: !4)
+!20 = !DILocation(line: 38, column: 122, scope: !4)
+!21 = !DILocation(line: 39, column: 34, scope: !4)
+!22 = !DILocation(line: 39, column: 50, scope: !4)
+!23 = !DILocation(line: 39, column: 112, scope: !4)
+!24 = !DILocation(line: 40, column: 22, scope: !4)
+!25 = !DILocation(line: 42, column: 23, scope: !4)
+!26 = !DILocation(line: 43, column: 48, scope: !4)
+!27 = !DILocation(line: 261, column: 15, scope: !28, inlinedAt: !30)
+!28 = distinct !DILexicalBlockFile(scope: !4, file: !29, discriminator: 0)
+!29 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
+!30 = !DILocation(line: 44, column: 25, scope: !4)
+!31 = !DILocation(line: 291, column: 36, scope: !28, inlinedAt: !30)
+!32 = !DILocation(line: 51, column: 19, scope: !4)
+!33 = !DILocation(line: 45, column: 30, scope: !4)
+!34 = !DILocation(line: 45, column: 35, scope: !4)
+!35 = !DILocation(line: 48, column: 18, scope: !4)
+!36 = !DILocation(line: 50, column: 19, scope: !4)
+!37 = !DILocation(line: 52, column: 25, scope: !4)
+!38 = !DILocation(line: 52, column: 37, scope: !4)
+!39 = !DILocation(line: 52, column: 4, scope: !4)

progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.source ADDED Viewed

	@@ -0,0 +1,230 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":18:0)
+#loc48 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
+#loc50 = loc(unknown)
+#loc53 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
+#loc57 = loc("in_ptr0"(#loc))
+#loc58 = loc("in_ptr1"(#loc))
+#loc59 = loc("in_ptr2"(#loc))
+#loc60 = loc("out_ptr1"(#loc))
+#loc61 = loc("xnumel"(#loc))
+#loc62 = loc("r0_numel"(#loc))
+#loc106 = loc("input"(#loc48))
+#loc107 = loc("a"(#loc53))
+#loc108 = loc("b"(#loc53))
+module { // Triton IR (pre-optimization) for one row-wise reduction kernel plus the sum helpers it calls directly or indirectly
+  tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { // the %xnumel / %r0_numel arguments are not read below; the body uses the constants 47616 and 128
+    %xnumel_0 = arith.constant 47616 : i32 loc(#loc63)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc64) // reduction length; used as the loop upper bound
+    %xoffset = tt.get_program_id x : i32 loc(#loc65)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc66)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc66)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc66) // program id * 64 = first row of this program
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc67)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc68)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc69)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc69) // 64 global row indices as a 64x1 column
+    %xmask = arith.constant dense<47616> : tensor<64x1xi32> loc(#loc70)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<64x1xi32> loc(#loc70) // row-in-range mask (row < 47616)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc71)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc72) // column offsets 0..7 as a 1x8 row
+    %x0 = arith.constant 1488 : i32 loc(#loc73)
+    %x0_10 = arith.constant 1488 : i32 loc(#loc73)
+    %x0_11 = arith.constant dense<1488> : tensor<64x1xi32> loc(#loc73)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc73) // x0 = row % 1488
+    %x1 = arith.constant 1488 : i32 loc(#loc74)
+    %x1_13 = arith.constant 1488 : i32 loc(#loc74)
+    %x1_14 = arith.constant dense<1488> : tensor<64x1xi32> loc(#loc74)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc74) // x1 = row / 1488
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc75)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc75) // zero-initialised 64x8 accumulator
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16) -> (tensor<64x8xf32>)  : i32 { // columns 0..128 in steps of 8, carrying the accumulator
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc77)
+      %r0_index_24 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc77) // the 8 column indices of this step
+      %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc78)
+      %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x8xi32> loc(#loc78) // column-in-range mask (column < 128)
+      %tmp0 = arith.constant 128 : i32 loc(#loc79)
+      %tmp0_26 = arith.constant 128 : i32 loc(#loc79)
+      %tmp0_27 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc79)
+      %tmp0_28 = arith.muli %tmp0_27, %x1_15 : tensor<64x1xi32> loc(#loc79) // 128 * x1
+      %tmp0_29 = tt.broadcast %r0_index_24 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc80)
+      %tmp0_30 = tt.broadcast %tmp0_28 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc80)
+      %tmp0_31 = arith.addi %tmp0_29, %tmp0_30 : tensor<64x8xi32> loc(#loc80)
+      %tmp0_32 = arith.constant 4096 : i32 loc(#loc81)
+      %tmp0_33 = arith.constant 4096 : i32 loc(#loc81)
+      %tmp0_34 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc81)
+      %tmp0_35 = arith.muli %tmp0_34, %x0_12 : tensor<64x1xi32> loc(#loc81) // 4096 * x0
+      %tmp0_36 = tt.broadcast %tmp0_35 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc82)
+      %tmp0_37 = arith.addi %tmp0_31, %tmp0_36 : tensor<64x8xi32> loc(#loc82) // in_ptr0 element offset: column + 128*x1 + 4096*x0
+      %tmp0_38 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc83)
+      %tmp0_39 = tt.addptr %tmp0_38, %tmp0_37 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc83)
+      %tmp0_40 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc84)
+      %tmp0_41 = tt.broadcast %xmask_8 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc84)
+      %tmp0_42 = arith.andi %tmp0_40, %tmp0_41 : tensor<64x8xi1> loc(#loc84) // combined column-and-row mask
+      %tmp0_43 = arith.constant 0.000000e+00 : f32 loc(#loc85)
+      %tmp0_44 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc85)
+      %tmp0_45 = arith.truncf %tmp0_44 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc85) // bf16 zeros used as the value for masked-off elements
+      %tmp0_46 = tt.load %tmp0_39, %tmp0_42, %tmp0_45 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc85) // masked load from in_ptr0
+      %tmp0_47 = arith.extf %tmp0_46 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc86) // widen to f32
+      %tmp1 = arith.constant 128 : i32 loc(#loc87)
+      %tmp1_48 = arith.constant 128 : i32 loc(#loc87)
+      %tmp1_49 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc87)
+      %tmp1_50 = arith.muli %tmp1_49, %xindex_7 : tensor<64x1xi32> loc(#loc87) // 128 * row
+      %tmp1_51 = tt.broadcast %r0_index_24 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc88)
+      %tmp1_52 = tt.broadcast %tmp1_50 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc88)
+      %tmp1_53 = arith.addi %tmp1_51, %tmp1_52 : tensor<64x8xi32> loc(#loc88) // in_ptr1 element offset: column + 128*row
+      %tmp1_54 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc89)
+      %tmp1_55 = tt.addptr %tmp1_54, %tmp1_53 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc89)
+      %tmp1_56 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc90)
+      %tmp1_57 = tt.broadcast %xmask_8 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc90)
+      %tmp1_58 = arith.andi %tmp1_56, %tmp1_57 : tensor<64x8xi1> loc(#loc90)
+      %tmp1_59 = arith.constant 0.000000e+00 : f32 loc(#loc91)
+      %tmp1_60 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc91)
+      %tmp1_61 = arith.truncf %tmp1_60 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc91)
+      %tmp1_62 = tt.load %tmp1_55, %tmp1_58, %tmp1_61 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc91) // masked load from in_ptr1, zeros where masked off
+      %tmp1_63 = arith.extf %tmp1_62 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc92)
+      %tmp2 = arith.mulf %tmp0_47, %tmp1_63 : tensor<64x8xf32> loc(#loc93) // elementwise product in f32
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x8xf32> loc(#loc94) // accumulator + product
+      %_tmp4_64 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc95)
+      %_tmp4_65 = tt.broadcast %xmask_8 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc95)
+      %_tmp4_66 = arith.andi %_tmp4_64, %_tmp4_65 : tensor<64x8xi1> loc(#loc95)
+      %_tmp4_67 = arith.select %_tmp4_66, %tmp5, %_tmp4_23 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc96) // masked-off positions keep the previous accumulator value
+      scf.yield %_tmp4_67 : tensor<64x8xf32> loc(#loc35)
+    } loc(#loc76)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc97) // reduce the 64x8 accumulator to one sum per row
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc98)
+    %tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc99)
+    %tmp7_19 = tt.addptr %tmp7, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc99)
+    %tmp7_20 = tt.load %tmp7_19, %xmask_8 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>> loc(#loc100) // in_ptr2[row], masked by row range; no explicit fill value is given
+    %tmp8 = arith.constant 0.693147182 : f32 loc(#loc101) // numerically ln(2) at f32 precision
+    %tmp9 = arith.constant dense<0.693147182> : tensor<64x1xf32> loc(#loc102)
+    %tmp9_21 = arith.mulf %tmp7_20, %tmp9 : tensor<64x1xf32> loc(#loc102)
+    %tmp10 = arith.constant 1.44269502 : f32 loc(#loc103) // numerically log2(e) at f32 precision
+    %tmp11 = arith.constant dense<1.44269502> : tensor<64x1xf32> loc(#loc104)
+    %tmp11_22 = arith.mulf %tmp9_21, %tmp11 : tensor<64x1xf32> loc(#loc104)
+    %tmp12 = arith.subf %tmp4_18, %tmp11_22 : tensor<64x1xf32> loc(#loc105) // row sum minus the twice-scaled in_ptr2 value
+    %4 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc45)
+    %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc45)
+    tt.store %5, %tmp12, %xmask_8 : tensor<64x1x!tt.ptr<f32>> loc(#loc46) // masked store to out_ptr1[row]
+    tt.return loc(#loc47)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc48))) -> tensor<64xf32> attributes {noinline = false} { // sums a 64x8 f32 tensor along axis 1, giving 64 values
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc49) // combiner: adds two elements
+      tt.reduce.return %2 : f32 loc(#loc49)
+    }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc49)
+    tt.return %0 : tensor<64xf32> loc(#loc51)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc52)
+    tt.return %1 : tensor<64xf32> loc(#loc52)
+  } loc(#loc48)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc53)), %b: f32 loc("b"(#loc53))) -> f32 attributes {noinline = false} { // returns %a + %b
+    %0 = arith.addf %a, %b : f32 loc(#loc54)
+    tt.return %0 : f32 loc(#loc55)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc56)
+    tt.return %1 : f32 loc(#loc56)
+  } loc(#loc53)
+} loc(#loc)
+#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":19:13)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":20:15)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":23:28)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":23:33)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":24:36)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":24:44)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":24:23)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":25:21)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":26:27)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":26:37)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":28:19)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":29:19)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":31:43)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":32:40)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":33:31)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":34:29)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:45)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:41)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:55)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:50)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:34)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:70)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:60)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:122)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:45)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:41)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:34)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:60)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:50)
+#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:112)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":40:22)
+#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":42:23)
+#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":43:35)
+#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":43:48)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":43:8)
+#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":44:25)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":44:28)
+#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":45:30)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":45:35)
+#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":47:11)
+#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":48:18)
+#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":49:12)
+#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":50:19)
+#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":51:19)
+#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":52:25)
+#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":52:37)
+#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":52:4)
+#loc49 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc51 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
+#loc52 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
+#loc54 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc55 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
+#loc56 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
+#loc63 = loc("xnumel"(#loc1))
+#loc64 = loc("r0_numel"(#loc2))
+#loc65 = loc("xoffset"(#loc3))
+#loc66 = loc("xoffset"(#loc4))
+#loc67 = loc("xindex"(#loc5))
+#loc68 = loc("xindex"(#loc6))
+#loc69 = loc("xindex"(#loc7))
+#loc70 = loc("xmask"(#loc8))
+#loc71 = loc("r0_base"(#loc9))
+#loc72 = loc("r0_base"(#loc10))
+#loc73 = loc("x0"(#loc11))
+#loc74 = loc("x1"(#loc12))
+#loc75 = loc("_tmp4"(#loc13))
+#loc76 = loc("_tmp4"(#loc14))
+#loc77 = loc("r0_index"(#loc15))
+#loc78 = loc("r0_mask"(#loc16))
+#loc79 = loc("tmp0"(#loc17))
+#loc80 = loc("tmp0"(#loc18))
+#loc81 = loc("tmp0"(#loc19))
+#loc82 = loc("tmp0"(#loc20))
+#loc83 = loc("tmp0"(#loc21))
+#loc84 = loc("tmp0"(#loc22))
+#loc85 = loc("tmp0"(#loc23))
+#loc86 = loc("tmp0"(#loc24))
+#loc87 = loc("tmp1"(#loc25))
+#loc88 = loc("tmp1"(#loc26))
+#loc89 = loc("tmp1"(#loc27))
+#loc90 = loc("tmp1"(#loc28))
+#loc91 = loc("tmp1"(#loc29))
+#loc92 = loc("tmp1"(#loc30))
+#loc93 = loc("tmp2"(#loc31))
+#loc94 = loc("tmp5"(#loc32))
+#loc95 = loc("_tmp4"(#loc33))
+#loc96 = loc("_tmp4"(#loc34))
+#loc97 = loc("tmp4"(#loc36))
+#loc98 = loc("tmp4"(#loc37))
+#loc99 = loc("tmp7"(#loc38))
+#loc100 = loc("tmp7"(#loc39))
+#loc101 = loc("tmp8"(#loc40))
+#loc102 = loc("tmp9"(#loc41))
+#loc103 = loc("tmp10"(#loc42))
+#loc104 = loc("tmp11"(#loc43))
+#loc105 = loc("tmp12"(#loc44))

progress/github/SpecForge/cache/compiled_kernels/triton/0/2KLHX5F2UZUCXBSUTWFXHP3MNYSJN73BED3D23CIU7RBRXFXBS3Q/triton_red_fused_mul_0.ttir ADDED Viewed

	@@ -0,0 +1,163 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":18:0)
+#loc1 = loc(unknown)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":44:25)
+#loc44 = loc("in_ptr0"(#loc))
+#loc45 = loc("in_ptr1"(#loc))
+#loc46 = loc("in_ptr2"(#loc))
+#loc47 = loc("out_ptr1"(#loc))
+#loc48 = loc("xnumel"(#loc))
+#loc49 = loc("r0_numel"(#loc))
+#loc81 = loc("tmp4"(#loc35))
+#loc87 = loc(callsite(#loc1 at #loc81))
+module {
+  tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc2)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %tmp11 = arith.constant dense<1.44269502> : tensor<64x1xf32> loc(#loc50)
+    %tmp9 = arith.constant dense<0.693147182> : tensor<64x1xf32> loc(#loc51)
+    %cst_0 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1488> : tensor<64x1xi32> loc(#loc1)
+    %xmask = arith.constant dense<47616> : tensor<64x1xi32> loc(#loc52)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc53)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc54)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc55)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc56)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc57)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc57)
+    %xmask_9 = arith.cmpi slt, %xindex_8, %xmask : tensor<64x1xi32> loc(#loc52)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc58)
+    %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc59)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc60)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc61)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4_16 = %cst_3) -> (tensor<64x8xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc63)
+      %r0_index_17 = arith.addi %r0_index, %r0_base_10 : tensor<1x8xi32> loc(#loc63)
+      %r0_mask = arith.cmpi slt, %r0_index_17, %cst_2 : tensor<1x8xi32> loc(#loc64)
+      %tmp0 = arith.muli %x1, %cst_1 : tensor<64x1xi32> loc(#loc65)
+      %tmp0_18 = tt.broadcast %r0_index_17 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc66)
+      %tmp0_19 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc66)
+      %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<64x8xi32> loc(#loc66)
+      %tmp0_21 = arith.muli %x0, %cst_0 : tensor<64x1xi32> loc(#loc67)
+      %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc68)
+      %tmp0_23 = arith.addi %tmp0_20, %tmp0_22 : tensor<64x8xi32> loc(#loc68)
+      %tmp0_24 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc69)
+      %tmp0_25 = tt.addptr %tmp0_24, %tmp0_23 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc69)
+      %tmp0_26 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc70)
+      %tmp0_27 = tt.broadcast %xmask_9 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc70)
+      %tmp0_28 = arith.andi %tmp0_26, %tmp0_27 : tensor<64x8xi1> loc(#loc70)
+      %tmp0_29 = tt.load %tmp0_25, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc71)
+      %tmp0_30 = arith.extf %tmp0_29 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc72)
+      %tmp1 = arith.muli %xindex_8, %cst_1 : tensor<64x1xi32> loc(#loc73)
+      %tmp1_31 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc74)
+      %tmp1_32 = arith.addi %tmp0_18, %tmp1_31 : tensor<64x8xi32> loc(#loc74)
+      %tmp1_33 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc75)
+      %tmp1_34 = tt.addptr %tmp1_33, %tmp1_32 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc75)
+      %tmp1_35 = tt.load %tmp1_34, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc76)
+      %tmp1_36 = arith.extf %tmp1_35 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc77)
+      %tmp2 = arith.mulf %tmp0_30, %tmp1_36 : tensor<64x8xf32> loc(#loc78)
+      %tmp5 = arith.addf %_tmp4_16, %tmp2 : tensor<64x8xf32> loc(#loc79)
+      %_tmp4_37 = arith.select %tmp0_28, %tmp5, %_tmp4_16 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc80)
+      scf.yield %_tmp4_37 : tensor<64x8xf32> loc(#loc33)
+    } loc(#loc62)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_16: f32 loc(callsite(#loc1 at #loc81)), %tmp4_17: f32 loc(callsite(#loc1 at #loc81))):
+      %tmp4_18 = arith.addf %tmp4_16, %tmp4_17 : f32 loc(#loc88)
+      tt.reduce.return %tmp4_18 : f32 loc(#loc86)
+    }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc86)
+    %tmp4_11 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc82)
+    %tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc83)
+    %tmp7_12 = tt.addptr %tmp7, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc83)
+    %tmp7_13 = tt.load %tmp7_12, %xmask_9 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>> loc(#loc84)
+    %tmp9_14 = arith.mulf %tmp7_13, %tmp9 : tensor<64x1xf32> loc(#loc51)
+    %tmp11_15 = arith.mulf %tmp9_14, %tmp11 : tensor<64x1xf32> loc(#loc50)
+    %tmp12 = arith.subf %tmp4_11, %tmp11_15 : tensor<64x1xf32> loc(#loc85)
+    %0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc41)
+    %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc41)
+    tt.store %1, %tmp12, %xmask_9 : tensor<64x1x!tt.ptr<f32>> loc(#loc42)
+    tt.return loc(#loc43)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":32:40)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":50:19)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":48:18)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":25:21)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":23:28)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":23:33)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":24:36)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":24:44)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":24:23)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":26:27)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":26:37)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":28:19)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":29:19)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":33:31)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":34:29)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:45)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:41)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:55)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:50)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:34)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:70)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:60)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":38:122)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:45)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:41)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:34)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:50)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":39:112)
+#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":40:22)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":42:23)
+#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":43:48)
+#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":43:8)
+#loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc36 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":44:28)
+#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":45:30)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":45:35)
+#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":51:19)
+#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":52:25)
+#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":52:37)
+#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ck/cckqdjssfjga6vnnhmdsek4txynwhq3yriokhqxjxjhfyc7jmwm3.py":52:4)
+#loc50 = loc("tmp11"(#loc3))
+#loc51 = loc("tmp9"(#loc4))
+#loc52 = loc("xmask"(#loc5))
+#loc53 = loc("xoffset"(#loc6))
+#loc54 = loc("xoffset"(#loc7))
+#loc55 = loc("xindex"(#loc8))
+#loc56 = loc("xindex"(#loc9))
+#loc57 = loc("xindex"(#loc10))
+#loc58 = loc("r0_base"(#loc11))
+#loc59 = loc("r0_base"(#loc12))
+#loc60 = loc("x0"(#loc13))
+#loc61 = loc("x1"(#loc14))
+#loc62 = loc("_tmp4"(#loc2))
+#loc63 = loc("r0_index"(#loc15))
+#loc64 = loc("r0_mask"(#loc16))
+#loc65 = loc("tmp0"(#loc17))
+#loc66 = loc("tmp0"(#loc18))
+#loc67 = loc("tmp0"(#loc19))
+#loc68 = loc("tmp0"(#loc20))
+#loc69 = loc("tmp0"(#loc21))
+#loc70 = loc("tmp0"(#loc22))
+#loc71 = loc("tmp0"(#loc23))
+#loc72 = loc("tmp0"(#loc24))
+#loc73 = loc("tmp1"(#loc25))
+#loc74 = loc("tmp1"(#loc26))
+#loc75 = loc("tmp1"(#loc27))
+#loc76 = loc("tmp1"(#loc28))
+#loc77 = loc("tmp1"(#loc29))
+#loc78 = loc("tmp2"(#loc30))
+#loc79 = loc("tmp5"(#loc31))
+#loc80 = loc("_tmp4"(#loc32))
+#loc82 = loc("tmp4"(#loc37))
+#loc83 = loc("tmp7"(#loc38))
+#loc84 = loc("tmp7"(#loc39))
+#loc85 = loc("tmp12"(#loc40))
+#loc86 = loc(callsite(#loc34 at #loc81))
+#loc88 = loc(callsite(#loc36 at #loc86))

progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.llir ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ptx ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.source ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttgir ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/0/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttir ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json"}}

progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"hash": "d6d8579bc758efaefba73018088fcd8f749b5f25c3ab12d02ac240e2311ac409", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}

progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir ADDED Viewed

	@@ -0,0 +1,936 @@

+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":18:0)
+#loc1 = loc(unknown)
+#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":520:16)
+#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":172:41)
+#loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":403:51)
+#loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":416:34)
+#loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":198:45)
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}>
+#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+#loc152 = loc("arg_Q"(#loc))
+#loc153 = loc("arg_K"(#loc))
+#loc154 = loc("arg_V"(#loc))
+#loc155 = loc("arg_LSE"(#loc))
+#loc156 = loc("arg_MAX"(#loc))
+#loc157 = loc("arg_KV_NUM_BLKS"(#loc))
+#loc158 = loc("arg_KV_IDX"(#loc))
+#loc159 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
+#loc160 = loc("arg_FULL_KV_IDX"(#loc))
+#loc161 = loc("out_ptr0"(#loc))
+#loc162 = loc("ks0"(#loc))
+#loc163 = loc("ks1"(#loc))
+#loc164 = loc("ks2"(#loc))
+#loc165 = loc("ks3"(#loc))
+#loc166 = loc("ks4"(#loc))
+#loc200 = loc(callsite(#loc41 at #loc42))
+#loc239 = loc("m_ij"(#loc83))
+#loc249 = loc("l_i"(#loc95))
+#loc283 = loc(callsite(#loc41 at #loc131))
+#loc345 = loc(callsite(#loc239 at #loc200))
+#loc355 = loc(callsite(#loc249 at #loc200))
+#loc374 = loc(callsite(#loc239 at #loc283))
+#loc384 = loc(callsite(#loc249 at #loc283))
+#loc406 = loc(callsite(#loc1 at #loc345))
+#loc408 = loc(callsite(#loc1 at #loc355))
+#loc436 = loc(callsite(#loc1 at #loc374))
+#loc438 = loc(callsite(#loc1 at #loc384))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1)
+    %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<128x1xi32, #mma> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<1x64xi32, #mma> loc(#loc1)
+    %cst_3 = arith.constant dense<false> : tensor<128x64xi1, #mma> loc(#loc1)
+    %cst_4 = arith.constant dense<16> : tensor<1x64xi32, #mma> loc(#loc1)
+    %cst_5 = arith.constant dense<16> : tensor<128x1xi32, #mma> loc(#loc1)
+    %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1)
+    %cst_7 = arith.constant dense<1024> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc1)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c63_i32 = arith.constant 63 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
+    %cst_13 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1)
+    %cst_16 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1)
+    %cst_17 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1)
+    %cst_18 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
+    %c-1_i32 = arith.constant -1 : i32 loc(#loc1)
+    %c3_i32 = arith.constant 3 : i32 loc(#loc1)
+    %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc2)
+    %q_start = tt.get_program_id x : i32 loc(#loc167)
+    %off_zq = tt.get_program_id y : i32 loc(#loc168)
+    %off_hq = tt.get_program_id z : i32 loc(#loc169)
+    %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc170)
+    %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc171)
+    %q_offset_19 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc172)
+    %q_offset_20 = arith.addi %q_offset, %q_offset_19 : i32 loc(#loc173)
+    %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc174)
+    %Q = tt.addptr %arg_Q, %q_offset_20 : !tt.ptr<bf16>, i32 loc(#loc175)
+    %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc176)
+    %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc177)
+    %sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc178)
+    %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc179)
+    %offs_m_21 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc180)
+    %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc180)
+    %offs_m_23 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc180)
+    %offs_m_24 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
+    %offs_m_25 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
+    %offs_m_26 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc181)
+    %offs_m_27 = arith.addi %offs_m_24, %offs_m_21 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
+    %offs_m_28 = arith.addi %offs_m_25, %offs_m_22 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
+    %offs_m_29 = arith.addi %offs_m_26, %offs_m_23 : tensor<128xi32, #blocked1> loc(#loc181)
+    %ptr = tt.expand_dims %offs_m_27 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc297)
+    %ptr_30 = tt.expand_dims %offs_m_28 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc297)
+    %ptr_31 = arith.muli %ptr, %cst_8 : tensor<128x1xi32, #blocked> loc(#loc298)
+    %ptr_32 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>, #blocked> loc(#loc299)
+    %ptr_33 = tt.addptr %ptr_32, %ptr_31 : tensor<128x1x!tt.ptr<bf16>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc299)
+    %ptr_34 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc300)
+    %ptr_35 = tt.expand_dims %ptr_34 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc300)
+    %ptr_36 = tt.broadcast %ptr_33 : tensor<128x1x!tt.ptr<bf16>, #blocked> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc301)
+    %ptr_37 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc301)
+    %ptr_38 = tt.addptr %ptr_36, %ptr_37 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc301)
+    %q = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc302)
+    %q_39 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #mma> loc(#loc302)
+    %q_40 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32, #blocked> loc(#loc302)
+    %q_41 = tt.broadcast %q_40 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc303)
+    %q_42 = tt.load %ptr_38, %q_41, %cst_10 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc303)
+    %q_43 = ttg.local_alloc %q_42 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc303)
+    %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc188)
+    %kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc189)
+    %kv_start_44 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc190)
+    %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc191)
+    %kv_num_blocks_45 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc192)
+    %block_n_end = arith.muli %kv_num_blocks_45, %c2_i32 : i32 loc(#loc193)
+    %block_n_end_46 = arith.addi %ks1, %c63_i32 : i32 loc(#loc304)
+    %block_n_end_47 = arith.divsi %block_n_end_46, %c64_i32 : i32 loc(#loc305)
+    %block_n_end_48 = arith.maxsi %block_n_end_47, %c1_i32 : i32 loc(#loc195)
+    %block_n_end_49 = arith.minsi %block_n_end, %block_n_end_48 : i32 loc(#loc196)
+    %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc197)
+    %offs_n_50 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc197)
+    %offs_n_51 = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
+    %offs_n_52 = arith.addi %offs_n_51, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
+    %1 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc39)
+    %ptr_53 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc393)
+    %ptr_54 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc394)
+    %k = tt.splat %ks1 : i32 -> tensor<64x1xi32, #blocked> loc(#loc395)
+    %m = arith.remsi %ptr_30, %q_39 : tensor<128x1xi32, #mma> loc(#loc396)
+    %n = tt.splat %ks1 : i32 -> tensor<1x64xi32, #mma> loc(#loc397)
+    %tmp3 = arith.cmpi slt, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc309)
+    %tmp5 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
+    %tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311)
+    %tmp7 = arith.cmpi sge, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc312)
+    %tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
+    %tmp14 = arith.remsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc314)
+    %tmp14_55 = arith.cmpi ne, %tmp14, %cst_0 : tensor<128x1xi32, #mma> loc(#loc315)
+    %tmp14_56 = arith.divsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc316)
+    %tmp14_57 = arith.subi %tmp14_56, %cst_1 : tensor<128x1xi32, #mma> loc(#loc317)
+    %tmp14_58 = arith.select %tmp14_55, %tmp14_57, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc318)
+    %tmp14_59 = arith.select %tmp3, %tmp14_58, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc319)
+    %tmp17 = tt.broadcast %tmp14_59 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
+    %ptr_60 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc398)
+    %k_61 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc399)
+    %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc400)
+    %kv_offset = arith.cmpi sgt, %block_n_end_49, %c0_i32 : i32 loc(#loc462)
+    %offs_n_load = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
+    %offs_n_load_62 = arith.addi %offs_n_load, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
+    %ptr_63 = tt.expand_dims %offs_n_load_62 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
+    %ptr_64 = arith.muli %ptr_63, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
+    %ptr_65 = tt.addptr %ptr_53, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
+    %ptr_66 = tt.broadcast %ptr_65 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
+    %ptr_67 = tt.addptr %ptr_66, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
+    %k_68 = arith.cmpi slt, %ptr_63, %k : tensor<64x1xi32, #blocked> loc(#loc395)
+    %k_69 = tt.broadcast %k_68 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
+    %k_70 = ttg.memdesc_index %k_61[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
+    %kv_offset_71 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
+    %kv_offset_72 = arith.andi %kv_offset_71, %k_69 : tensor<64x128xi1, #blocked> loc(#loc462)
+    %k_73 = ttg.async_copy_global_to_local %ptr_67, %k_70 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
+    %k_74 = ttg.async_commit_group tokens %k_73 loc(#loc399)
+    %ptr_75 = tt.addptr %ptr_60, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
+    %ptr_76 = tt.broadcast %ptr_75 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
+    %ptr_77 = tt.addptr %ptr_76, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
+    %v_78 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
+    %v_79 = ttg.async_copy_global_to_local %ptr_77, %v_78 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
+    %v_80 = ttg.async_commit_group tokens %v_79 loc(#loc400)
+    %kv_offset_81 = arith.cmpi sgt, %block_n_end_49, %c1_i32 : i32 loc(#loc462)
+    %kv_base_offset = arith.addi %kv_start_44, %c64_i32 : i32 loc(#loc324)
+    %offs_n_load_82 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
+    %offs_n_load_83 = arith.addi %offs_n_load_82, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
+    %ptr_84 = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
+    %ptr_85 = arith.muli %ptr_84, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
+    %ptr_86 = tt.addptr %ptr_53, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
+    %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
+    %ptr_88 = tt.addptr %ptr_87, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
+    %k_89 = arith.cmpi slt, %ptr_84, %k : tensor<64x1xi32, #blocked> loc(#loc395)
+    %k_90 = tt.broadcast %k_89 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
+    %k_91 = ttg.memdesc_index %k_61[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
+    %kv_offset_92 = tt.splat %kv_offset_81 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
+    %kv_offset_93 = arith.andi %kv_offset_92, %k_90 : tensor<64x128xi1, #blocked> loc(#loc462)
+    %k_94 = ttg.async_copy_global_to_local %ptr_88, %k_91 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
+    %k_95 = ttg.async_commit_group tokens %k_94 loc(#loc399)
+    %ptr_96 = tt.addptr %ptr_60, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
+    %ptr_97 = tt.broadcast %ptr_96 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
+    %ptr_98 = tt.addptr %ptr_97, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
+    %v_99 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
+    %v_100 = ttg.async_copy_global_to_local %ptr_98, %v_99 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
+    %v_101 = ttg.async_commit_group tokens %v_100 loc(#loc400)
+    ttng.fence_async_shared {bCluster = false} loc(#loc325)
+    %kv_offset_102:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_49 step %c1_i32 iter_args(%acc_174 = %cst_14, %arg17 = %cst_12, %arg18 = %cst_18, %arg19 = %c64_i32, %arg20 = %1, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_175 = %k_74, %k_176 = %k_95, %v_177 = %v_80, %v_178 = %v_101, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32)  : i32 {
+      %kv_offset_179 = arith.subi %block_n_end_49, %c2_i32 : i32 loc(#loc462)
+      %kv_offset_180 = arith.cmpi slt, %kv_offset_173, %kv_offset_179 : i32 loc(#loc462)
+      %kv_offset_181 = arith.subi %block_n_end_49, %c1_i32 : i32 loc(#loc462)
+      %kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc462)
+      %kv_offset_183 = arith.addi %arg22, %c1_i32 : i32 loc(#loc462)
+      %kv_offset_184 = arith.cmpi sge, %kv_offset_183, %c3_i32 : i32 loc(#loc462)
+      %kv_offset_185 = arith.select %kv_offset_184, %c0_i32, %kv_offset_183 : i32 loc(#loc462)
+      %k_186 = ttg.async_wait %k_175, %v_177 {num = 2 : i32} loc(#loc399)
+      %k_187 = ttg.memdesc_index %k_61[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
+      %k_188 = ttg.memdesc_trans %k_187 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc326)
+      %qk = ttng.warp_group_dot %q_43, %k_188, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc325)
+      %qk_189:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_188, %acc_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc325)
+      %qk_190 = arith.mulf %qk_189#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc327)
+      %n_191 = arith.remsi %arg20, %n : tensor<1x64xi32, #mma> loc(#loc397)
+      %post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc328)
+      %post_mod_scores_192 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc329)
+      %post_mod_scores_193 = arith.select %post_mod_scores_192, %qk_190, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc329)
+      %tmp5_194 = tt.broadcast %n_191 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
+      %tmp5_195 = arith.cmpi sle, %tmp5_194, %tmp5 : tensor<128x64xi32, #mma> loc(#loc310)
+      %tmp6_196 = arith.andi %tmp6, %tmp5_195 : tensor<128x64xi1, #mma> loc(#loc311)
+      %tmp8 = arith.cmpi slt, %n_191, %cst : tensor<1x64xi32, #mma> loc(#loc330)
+      %tmp9_197 = tt.broadcast %tmp8 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
+      %tmp9_198 = arith.andi %tmp9, %tmp9_197 : tensor<128x64xi1, #mma> loc(#loc313)
+      %tmp10 = arith.extui %tmp8 : tensor<1x64xi1, #mma> to tensor<1x64xi32, #mma> loc(#loc331)
+      %tmp10_199 = arith.cmpi eq, %tmp10, %cst : tensor<1x64xi32, #mma> loc(#loc331)
+      %tmp11 = tt.broadcast %tmp10_199 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc332)
+      %tmp11_200 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1, #mma> loc(#loc332)
+      %tmp16 = arith.remsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc333)
+      %tmp16_201 = arith.cmpi ne, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc334)
+      %tmp16_202 = arith.divsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc335)
+      %tmp16_203 = arith.subi %tmp16_202, %cst_2 : tensor<1x64xi32, #mma> loc(#loc336)
+      %tmp16_204 = arith.select %tmp16_201, %tmp16_203, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc337)
+      %tmp16_205 = arith.select %tmp8, %tmp16_204, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc338)
+      %tmp17_206 = tt.broadcast %tmp16_205 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
+      %tmp17_207 = arith.cmpi eq, %tmp17, %tmp17_206 : tensor<128x64xi32, #mma> loc(#loc320)
+      %tmp18 = arith.andi %tmp11_200, %tmp17_207 : tensor<128x64xi1, #mma> loc(#loc339)
+      %tmp19 = arith.ori %tmp9_198, %tmp18 : tensor<128x64xi1, #mma> loc(#loc340)
+      %tmp20 = arith.ori %tmp6_196, %tmp19 : tensor<128x64xi1, #mma> loc(#loc341)
+      %mask_mod_output = arith.select %post_mod_scores_192, %tmp20, %cst_3 : tensor<128x64xi1, #mma>, tensor<128x64xi1, #mma> loc(#loc342)
+      %post_mod_scores_208 = arith.select %mask_mod_output, %post_mod_scores_193, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc343)
+      %post_mod_scores_209 = arith.mulf %post_mod_scores_208, %cst_17 : tensor<128x64xf32, #mma> loc(#loc344)
+      %m_ij = "tt.reduce"(%post_mod_scores_209) <{axis = 1 : i32}> ({
+      ^bb0(%m_ij_267: f32 loc(callsite(#loc1 at #loc345)), %m_ij_268: f32 loc(callsite(#loc1 at #loc345))):
+        %m_ij_269 = arith.maxnumf %m_ij_267, %m_ij_268 : f32 loc(#loc457)
+        tt.reduce.return %m_ij_269 : f32 loc(#loc405)
+      }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc405)
+      %m_ij_210 = arith.maxnumf %arg18, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc346)
+      %masked_out_rows = arith.cmpf oeq, %m_ij_210, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc347)
+      %m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_210 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc348)
+      %alpha = arith.subf %arg18, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349)
+      %alpha_211 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc350)
+      %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc351)
+      %p_212 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc352)
+      %p_213 = arith.subf %post_mod_scores_209, %p_212 : tensor<128x64xf32, #mma> loc(#loc352)
+      %p_214 = math.exp2 %p_213 : tensor<128x64xf32, #mma> loc(#loc353)
+      %l_i_215 = arith.mulf %arg17, %alpha_211 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc354)
+      %l_i_216 = "tt.reduce"(%p_214) <{axis = 1 : i32}> ({
+      ^bb0(%l_i_267: f32 loc(callsite(#loc1 at #loc355)), %l_i_268: f32 loc(callsite(#loc1 at #loc355))):
+        %l_i_269 = arith.addf %l_i_267, %l_i_268 : f32 loc(#loc458)
+        tt.reduce.return %l_i_269 : f32 loc(#loc407)
+      }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc407)
+      %l_i_217 = arith.addf %l_i_215, %l_i_216 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc356)
+      %acc_218 = tt.expand_dims %alpha_211 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc357)
+      %acc_219 = ttg.convert_layout %acc_218 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc358)
+      %acc_220 = tt.broadcast %acc_219 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc358)
+      %acc_221 = arith.mulf %qk_189#3, %acc_220 : tensor<128x128xf32, #mma1> loc(#loc358)
+      %v_222 = ttg.memdesc_index %v[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
+      %acc_223 = arith.truncf %p_214 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc359)
+      %acc_224 = ttg.convert_layout %acc_223 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc359)
+      %acc_225 = ttng.warp_group_dot %acc_224, %v_222, %acc_221 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc360)
+      %offs_n_226 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc361)
+      %offs_n_227 = arith.addi %arg20, %offs_n_226 : tensor<1x64xi32, #mma> loc(#loc361)
+      %kv_offset_228 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc462)
+      %cur_block_idx = arith.divsi %kv_offset_228, %c2_i32 : i32 loc(#loc409)
+      %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc410)
+      %cur_block_229 = tt.load %cur_block, %kv_offset_182 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc411)
+      %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc412)
+      %next_block_230 = arith.cmpi slt, %next_block, %kv_num_blocks_45 : i32 loc(#loc413)
+      %next_block_231 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc414)
+      %kv_offset_232 = arith.andi %kv_offset_182, %next_block_230 : i1 loc(#loc462)
+      %next_block_233 = tt.load %next_block_231, %kv_offset_232 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc415)
+      %needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc416)
+      %needs_jump_234 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc417)
+      %needs_jump_235 = arith.cmpi eq, %needs_jump_234, %c0_i32 : i32 loc(#loc418)
+      %jump_to_block = arith.subi %next_block_233, %cur_block_229 : i32 loc(#loc419)
+      %jump_to_block_236 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc420)
+      %jump_to_block_237 = arith.subi %jump_to_block_236, %c64_i32 : i32 loc(#loc421)
+      %offset = arith.extui %needs_jump_235 : i1 to i32 loc(#loc422)
+      %offset_238 = arith.muli %jump_to_block_237, %offset : i32 loc(#loc422)
+      %offset_239 = arith.subi %c1_i32, %offset : i32 loc(#loc423)
+      %offset_240 = arith.muli %offset_239, %c64_i32 : i32 loc(#loc424)
+      %offset_241 = arith.addi %offset_238, %offset_240 : i32 loc(#loc425)
+      %kv_offset_242 = arith.addi %arg19, %offset_241 : i32 loc(#loc363)
+      %kv_offset_243 = arith.addi %arg21, %c1_i32 : i32 loc(#loc462)
+      %kv_offset_244 = arith.cmpi sge, %kv_offset_243, %c3_i32 : i32 loc(#loc462)
+      %kv_offset_245 = arith.select %kv_offset_244, %c0_i32, %kv_offset_243 : i32 loc(#loc462)
+      %kv_base_offset_246 = arith.addi %kv_start_44, %kv_offset_242 : i32 loc(#loc324)
+      %offs_n_load_247 = tt.splat %kv_base_offset_246 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
+      %offs_n_load_248 = arith.addi %offs_n_load_247, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
+      %ptr_249 = tt.expand_dims %offs_n_load_248 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
+      %ptr_250 = arith.muli %ptr_249, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
+      %ptr_251 = tt.addptr %ptr_53, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
+      %ptr_252 = tt.broadcast %ptr_251 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
+      %ptr_253 = tt.addptr %ptr_252, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
+      %k_254 = arith.cmpi slt, %ptr_249, %k : tensor<64x1xi32, #blocked> loc(#loc395)
+      %k_255 = tt.broadcast %k_254 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
+      %k_256 = ttg.memdesc_index %k_61[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
+      %kv_offset_257 = tt.splat %kv_offset_180 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
+      %kv_offset_258 = arith.andi %kv_offset_257, %k_255 : tensor<64x128xi1, #blocked> loc(#loc462)
+      %k_259 = ttg.async_copy_global_to_local %ptr_253, %k_256 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
+      %k_260 = ttg.async_commit_group tokens %k_259 loc(#loc399)
+      %ptr_261 = tt.addptr %ptr_60, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
+      %ptr_262 = tt.broadcast %ptr_261 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
+      %ptr_263 = tt.addptr %ptr_262, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
+      %v_264 = ttg.memdesc_index %v[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
+      %v_265 = ttg.async_copy_global_to_local %ptr_263, %v_264 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
+      %v_266 = ttg.async_commit_group tokens %v_265 loc(#loc400)
+      scf.yield %acc_225, %l_i_217, %m_ij_210, %kv_offset_242, %offs_n_227, %kv_offset_245, %kv_offset_185, %k_176, %k_260, %v_178, %v_266, %offset_241 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc462)
+    } loc(#loc462)
+    %kv_offset_103 = ttng.warp_group_dot_wait %kv_offset_102#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc462)
+    %kv_offset_104 = ttg.async_wait {num = 0 : i32} loc(#loc462)
+    ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
+    ttg.local_dealloc %k_61 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
+    %kv_indices_105 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc275)
+    %kv_start_106 = tt.load %kv_indices_105 : !tt.ptr<i32> loc(#loc276)
+    %kv_start_107 = arith.muli %kv_start_106, %c128_i32 : i32 loc(#loc277)
+    %kv_num_blocks_108 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc278)
+    %kv_num_blocks_109 = tt.load %kv_num_blocks_108 : !tt.ptr<i32> loc(#loc279)
+    %block_n_end_110 = arith.muli %kv_num_blocks_109, %c2_i32 : i32 loc(#loc280)
+    %block_n_end_111 = arith.minsi %block_n_end_110, %block_n_end_48 : i32 loc(#loc281)
+    %offs_n_112 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
+    %offs_n_113 = arith.addi %offs_n_112, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
+    %2 = tt.expand_dims %offs_n_113 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc130)
+    %k_114 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc426)
+    %v_115 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc427)
+    %kv_offset_116 = arith.cmpi sgt, %block_n_end_111, %c0_i32 : i32 loc(#loc463)
+    %offs_n_load_117 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
+    %offs_n_load_118 = arith.addi %offs_n_load_117, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
+    %ptr_119 = tt.expand_dims %offs_n_load_118 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
+    %ptr_120 = arith.muli %ptr_119, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
+    %ptr_121 = tt.addptr %ptr_53, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
+    %ptr_122 = tt.broadcast %ptr_121 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
+    %ptr_123 = tt.addptr %ptr_122, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
+    %k_124 = arith.cmpi slt, %ptr_119, %k : tensor<64x1xi32, #blocked> loc(#loc432)
+    %k_125 = tt.broadcast %k_124 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
+    %k_126 = ttg.memdesc_index %k_114[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
+    %kv_offset_127 = tt.splat %kv_offset_116 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
+    %kv_offset_128 = arith.andi %kv_offset_127, %k_125 : tensor<64x128xi1, #blocked> loc(#loc463)
+    %k_129 = ttg.async_copy_global_to_local %ptr_123, %k_126 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
+    %k_130 = ttg.async_commit_group tokens %k_129 loc(#loc426)
+    %ptr_131 = tt.addptr %ptr_60, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
+    %ptr_132 = tt.broadcast %ptr_131 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
+    %ptr_133 = tt.addptr %ptr_132, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
+    %v_134 = ttg.memdesc_index %v_115[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
+    %v_135 = ttg.async_copy_global_to_local %ptr_133, %v_134 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
+    %v_136 = ttg.async_commit_group tokens %v_135 loc(#loc427)
+    %kv_offset_137 = arith.cmpi sgt, %block_n_end_111, %c1_i32 : i32 loc(#loc463)
+    %kv_base_offset_138 = arith.addi %kv_start_107, %c64_i32 : i32 loc(#loc367)
+    %offs_n_load_139 = tt.splat %kv_base_offset_138 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
+    %offs_n_load_140 = arith.addi %offs_n_load_139, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
+    %ptr_141 = tt.expand_dims %offs_n_load_140 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
+    %ptr_142 = arith.muli %ptr_141, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
+    %ptr_143 = tt.addptr %ptr_53, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
+    %ptr_144 = tt.broadcast %ptr_143 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
+    %ptr_145 = tt.addptr %ptr_144, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
+    %k_146 = arith.cmpi slt, %ptr_141, %k : tensor<64x1xi32, #blocked> loc(#loc432)
+    %k_147 = tt.broadcast %k_146 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
+    %k_148 = ttg.memdesc_index %k_114[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
+    %kv_offset_149 = tt.splat %kv_offset_137 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
+    %kv_offset_150 = arith.andi %kv_offset_149, %k_147 : tensor<64x128xi1, #blocked> loc(#loc463)
+    %k_151 = ttg.async_copy_global_to_local %ptr_145, %k_148 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
+    %k_152 = ttg.async_commit_group tokens %k_151 loc(#loc426)
+    %ptr_153 = tt.addptr %ptr_60, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
+    %ptr_154 = tt.broadcast %ptr_153 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
+    %ptr_155 = tt.addptr %ptr_154, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
+    %v_156 = ttg.memdesc_index %v_115[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
+    %v_157 = ttg.async_copy_global_to_local %ptr_155, %v_156 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
+    %v_158 = ttg.async_commit_group tokens %v_157 loc(#loc427)
+    ttng.fence_async_shared {bCluster = false} loc(#loc368)
+    %kv_offset_159:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_111 step %c1_i32 iter_args(%kv_offset_174 = %kv_offset_103, %kv_offset_175 = %kv_offset_102#1, %kv_offset_176 = %kv_offset_102#2, %arg19 = %c64_i32, %arg20 = %2, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_177 = %k_130, %k_178 = %k_152, %v_179 = %v_136, %v_180 = %v_158, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32)  : i32 {
+      %kv_offset_181 = arith.subi %block_n_end_111, %c2_i32 : i32 loc(#loc463)
+      %kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc463)
+      %kv_offset_183 = arith.subi %block_n_end_111, %c1_i32 : i32 loc(#loc463)
+      %kv_offset_184 = arith.cmpi slt, %kv_offset_173, %kv_offset_183 : i32 loc(#loc463)
+      %kv_offset_185 = arith.addi %arg22, %c1_i32 : i32 loc(#loc463)
+      %kv_offset_186 = arith.cmpi sge, %kv_offset_185, %c3_i32 : i32 loc(#loc463)
+      %kv_offset_187 = arith.select %kv_offset_186, %c0_i32, %kv_offset_185 : i32 loc(#loc463)
+      %k_188 = ttg.async_wait %k_177, %v_179 {num = 2 : i32} loc(#loc426)
+      %k_189 = ttg.memdesc_index %k_114[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
+      %k_190 = ttg.memdesc_trans %k_189 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc369)
+      %qk = ttng.warp_group_dot %q_43, %k_190, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc368)
+      %qk_191:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_190, %kv_offset_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc368)
+      %qk_192 = arith.mulf %qk_191#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc370)
+      %post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc371)
+      %post_mod_scores_193 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc372)
+      %post_mod_scores_194 = arith.select %post_mod_scores_193, %qk_192, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc372)
+      %post_mod_scores_195 = arith.mulf %post_mod_scores_194, %cst_17 : tensor<128x64xf32, #mma> loc(#loc373)
+      %m_ij = "tt.reduce"(%post_mod_scores_195) <{axis = 1 : i32}> ({
+      ^bb0(%m_ij_253: f32 loc(callsite(#loc1 at #loc374)), %m_ij_254: f32 loc(callsite(#loc1 at #loc374))):
+        %m_ij_255 = arith.maxnumf %m_ij_253, %m_ij_254 : f32 loc(#loc459)
+        tt.reduce.return %m_ij_255 : f32 loc(#loc435)
+      }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc435)
+      %m_ij_196 = arith.maxnumf %kv_offset_176, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc375)
+      %masked_out_rows = arith.cmpf oeq, %m_ij_196, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc376)
+      %m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_196 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc377)
+      %alpha = arith.subf %kv_offset_176, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378)
+      %alpha_197 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc379)
+      %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc380)
+      %p_198 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc381)
+      %p_199 = arith.subf %post_mod_scores_195, %p_198 : tensor<128x64xf32, #mma> loc(#loc381)
+      %p_200 = math.exp2 %p_199 : tensor<128x64xf32, #mma> loc(#loc382)
+      %l_i_201 = arith.mulf %kv_offset_175, %alpha_197 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc383)
+      %l_i_202 = "tt.reduce"(%p_200) <{axis = 1 : i32}> ({
+      ^bb0(%l_i_253: f32 loc(callsite(#loc1 at #loc384)), %l_i_254: f32 loc(callsite(#loc1 at #loc384))):
+        %l_i_255 = arith.addf %l_i_253, %l_i_254 : f32 loc(#loc460)
+        tt.reduce.return %l_i_255 : f32 loc(#loc437)
+      }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc437)
+      %l_i_203 = arith.addf %l_i_201, %l_i_202 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc385)
+      %acc_204 = tt.expand_dims %alpha_197 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc386)
+      %acc_205 = ttg.convert_layout %acc_204 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc387)
+      %acc_206 = tt.broadcast %acc_205 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc387)
+      %acc_207 = arith.mulf %qk_191#3, %acc_206 : tensor<128x128xf32, #mma1> loc(#loc387)
+      %v_208 = ttg.memdesc_index %v_115[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
+      %acc_209 = arith.truncf %p_200 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc388)
+      %acc_210 = ttg.convert_layout %acc_209 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc388)
+      %acc_211 = ttng.warp_group_dot %acc_210, %v_208, %acc_207 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc389)
+      %offs_n_212 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc390)
+      %offs_n_213 = arith.addi %arg20, %offs_n_212 : tensor<1x64xi32, #mma> loc(#loc390)
+      %kv_offset_214 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc463)
+      %cur_block_idx = arith.divsi %kv_offset_214, %c2_i32 : i32 loc(#loc439)
+      %cur_block = tt.addptr %kv_indices_105, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc440)
+      %cur_block_215 = tt.load %cur_block, %kv_offset_184 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc441)
+      %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc442)
+      %next_block_216 = arith.cmpi slt, %next_block, %kv_num_blocks_109 : i32 loc(#loc443)
+      %next_block_217 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc444)
+      %kv_offset_218 = arith.andi %kv_offset_184, %next_block_216 : i1 loc(#loc463)
+      %next_block_219 = tt.load %next_block_217, %kv_offset_218 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc445)
+      %needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc446)
+      %needs_jump_220 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc447)
+      %needs_jump_221 = arith.cmpi eq, %needs_jump_220, %c0_i32 : i32 loc(#loc448)
+      %jump_to_block = arith.subi %next_block_219, %cur_block_215 : i32 loc(#loc449)
+      %jump_to_block_222 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc450)
+      %jump_to_block_223 = arith.subi %jump_to_block_222, %c64_i32 : i32 loc(#loc451)
+      %offset = arith.extui %needs_jump_221 : i1 to i32 loc(#loc452)
+      %offset_224 = arith.muli %jump_to_block_223, %offset : i32 loc(#loc452)
+      %offset_225 = arith.subi %c1_i32, %offset : i32 loc(#loc453)
+      %offset_226 = arith.muli %offset_225, %c64_i32 : i32 loc(#loc454)
+      %offset_227 = arith.addi %offset_224, %offset_226 : i32 loc(#loc455)
+      %kv_offset_228 = arith.addi %arg19, %offset_227 : i32 loc(#loc392)
+      %kv_offset_229 = arith.addi %arg21, %c1_i32 : i32 loc(#loc463)
+      %kv_offset_230 = arith.cmpi sge, %kv_offset_229, %c3_i32 : i32 loc(#loc463)
+      %kv_offset_231 = arith.select %kv_offset_230, %c0_i32, %kv_offset_229 : i32 loc(#loc463)
+      %kv_base_offset_232 = arith.addi %kv_start_107, %kv_offset_228 : i32 loc(#loc367)
+      %offs_n_load_233 = tt.splat %kv_base_offset_232 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
+      %offs_n_load_234 = arith.addi %offs_n_load_233, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
+      %ptr_235 = tt.expand_dims %offs_n_load_234 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
+      %ptr_236 = arith.muli %ptr_235, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
+      %ptr_237 = tt.addptr %ptr_53, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
+      %ptr_238 = tt.broadcast %ptr_237 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
+      %ptr_239 = tt.addptr %ptr_238, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
+      %k_240 = arith.cmpi slt, %ptr_235, %k : tensor<64x1xi32, #blocked> loc(#loc432)
+      %k_241 = tt.broadcast %k_240 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
+      %k_242 = ttg.memdesc_index %k_114[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
+      %kv_offset_243 = tt.splat %kv_offset_182 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
+      %kv_offset_244 = arith.andi %kv_offset_243, %k_241 : tensor<64x128xi1, #blocked> loc(#loc463)
+      %k_245 = ttg.async_copy_global_to_local %ptr_239, %k_242 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
+      %k_246 = ttg.async_commit_group tokens %k_245 loc(#loc426)
+      %ptr_247 = tt.addptr %ptr_60, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
+      %ptr_248 = tt.broadcast %ptr_247 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
+      %ptr_249 = tt.addptr %ptr_248, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
+      %v_250 = ttg.memdesc_index %v_115[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
+      %v_251 = ttg.async_copy_global_to_local %ptr_249, %v_250 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
+      %v_252 = ttg.async_commit_group tokens %v_251 loc(#loc427)
+      scf.yield %acc_211, %l_i_203, %m_ij_196, %kv_offset_228, %offs_n_213, %kv_offset_231, %kv_offset_187, %k_178, %k_246, %v_180, %v_252, %offset_227 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc463)
+    } loc(#loc463)
+    %kv_offset_160 = ttng.warp_group_dot_wait %kv_offset_159#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc463)
+    %kv_offset_161 = ttg.async_wait {num = 0 : i32} loc(#loc463)
+    ttg.local_dealloc %v_115 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
+    ttg.local_dealloc %k_114 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
+    %l_i = arith.cmpf oeq, %kv_offset_159#1, %cst_12 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc284)
+    %l_i_162 = arith.select %l_i, %cst_13, %kv_offset_159#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc285)
+    %acc = tt.expand_dims %l_i_162 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc286)
+    %acc_163 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc287)
+    %acc_164 = tt.broadcast %acc_163 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc287)
+    %acc_165 = arith.divf %kv_offset_160, %acc_164 : tensor<128x128xf32, #mma1> loc(#loc287)
+    %mask = arith.cmpi slt, %ptr_35, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc288)
+    %mask_166 = tt.broadcast %mask : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289)
+    %mask_167 = arith.andi %q_41, %mask_166 : tensor<128x128xi1, #blocked> loc(#loc289)
+    %3 = tt.splat %q_offset_19 : i32 -> tensor<1x128xi32, #blocked> loc(#loc138)
+    %4 = arith.addi %ptr_35, %3 : tensor<1x128xi32, #blocked> loc(#loc138)
+    %5 = tt.broadcast %4 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
+    %6 = tt.broadcast %ptr_31 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
+    %7 = arith.addi %5, %6 : tensor<128x128xi32, #blocked> loc(#loc139)
+    %8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc140)
+    %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc140)
+    %10 = arith.truncf %acc_165 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc141)
+    %11 = ttg.convert_layout %10 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc141)
+    tt.store %9, %11, %mask_167 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc141)
+    %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc290)
+    %off_hz_168 = arith.addi %off_hz, %off_hq : i32 loc(#loc291)
+    %l_ptrs = arith.muli %off_hz_168, %ks0 : i32 loc(#loc292)
+    %l_ptrs_169 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc293)
+    %l_ptrs_170 = tt.splat %l_ptrs_169 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc294)
+    %l_ptrs_171 = tt.addptr %l_ptrs_170, %offs_m_29 : tensor<128x!tt.ptr<f32>, #blocked1>, tensor<128xi32, #blocked1> loc(#loc294)
+    %lse = math.log2 %l_i_162 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc295)
+    %lse_172 = arith.addf %kv_offset_159#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc296)
+    %12 = tt.splat %ks0 : i32 -> tensor<128xi32, #blocked1> loc(#loc149)
+    %13 = arith.cmpi slt, %offs_m_29, %12 : tensor<128xi32, #blocked1> loc(#loc149)
+    %14 = ttg.convert_layout %lse_172 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc150)
+    tt.store %l_ptrs_171, %14, %13 : tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc150)
+    tt.return loc(#loc151)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":85:54)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":97:28)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":98:27)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":99:27)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":104:24)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":107:24)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":107:45)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":107:36)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":108:47)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":111:12)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":112:12)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":113:12)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":143:97)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":144:23)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":144:46)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":144:33)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:27)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":146:101)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:38)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:20)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:56)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:49)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":292:52)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":292:23)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":151:26)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":152:23)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":152:37)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":153:42)
+#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":153:28)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:45)
+#loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
+#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:92)
+#loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:102)
+#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:65)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":159:37)
+#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":159:24)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":167:48)
+#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":347:107)
+#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":257:21)
+#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":358:36)
+#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":359:36)
+#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":372:22)
+#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":374:23)
+#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":375:22)
+#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":376:23)
+#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":378:22)
+#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:70)
+#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:79)
+#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:91)
+#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:99)
+#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:102)
+#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:119)
+#loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":386:25)
+#loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":421:107)
+#loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":484:40)
+#loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":346:35)
+#loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":342:32)
+#loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":351:19)
+#loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":349:17)
+#loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":353:14)
+#loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":367:44)
+#loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":367:69)
+#loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":377:22)
+#loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":379:24)
+#loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":380:23)
+#loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:70)
+#loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:79)
+#loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:91)
+#loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:99)
+#loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:102)
+#loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:119)
+#loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":387:24)
+#loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":388:23)
+#loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":389:23)
+#loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":394:73)
+#loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":396:69)
+#loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":399:27)
+#loc82 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
+#loc84 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
+#loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":403:27)
+#loc86 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":405:35)
+#loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":406:51)
+#loc88 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":410:31)
+#loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":410:25)
+#loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":411:51)
+#loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":411:39)
+#loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":411:21)
+#loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":416:16)
+#loc94 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc96 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":416:24)
+#loc98 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":418:22)
+#loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":418:16)
+#loc100 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":422:22)
+#loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":422:44)
+#loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":530:26)
+#loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":247:33)
+#loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":527:63)
+#loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":248:38)
+#loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":248:24)
+#loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:109)
+#loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:113)
+#loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:55)
+#loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:25)
+#loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":250:30)
+#loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":250:35)
+#loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":250:60)
+#loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":251:34)
+#loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":251:48)
+#loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":251:63)
+#loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:29)
+#loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:47)
+#loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:61)
+#loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:42)
+#loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":531:21)
+#loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":181:35)
+#loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":182:27)
+#loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":182:41)
+#loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":183:51)
+#loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":183:32)
+#loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":184:49)
+#loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":184:69)
+#loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":186:28)
+#loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":193:52)
+#loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":206:26)
+#loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":206:34)
+#loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":208:20)
+#loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":208:16)
+#loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":214:38)
+#loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":214:30)
+#loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:49)
+#loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:62)
+#loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:25)
+#loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:92)
+#loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":221:26)
+#loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":221:31)
+#loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":222:32)
+#loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":222:23)
+#loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":222:40)
+#loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":223:33)
+#loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":223:20)
+#loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":227:48)
+#loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":227:29)
+#loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":229:4)
+#loc167 = loc("q_start"(#loc3))
+#loc168 = loc("off_zq"(#loc4))
+#loc169 = loc("off_hq"(#loc5))
+#loc170 = loc("off_hkv"(#loc6))
+#loc171 = loc("q_offset"(#loc7))
+#loc172 = loc("q_offset"(#loc8))
+#loc173 = loc("q_offset"(#loc9))
+#loc174 = loc("k_offset"(#loc10))
+#loc175 = loc("Q"(#loc11))
+#loc176 = loc("K"(#loc12))
+#loc177 = loc("V"(#loc13))
+#loc178 = loc("sparse_kv_idx_offset"(#loc14))
+#loc179 = loc("offs_m"(#loc15))
+#loc180 = loc("offs_m"(#loc16))
+#loc181 = loc("offs_m"(#loc17))
+#loc182 = loc("ptr"(#loc18))
+#loc183 = loc("q"(#loc19))
+#loc184 = loc("ptr"(#loc20))
+#loc185 = loc("ptr"(#loc21))
+#loc186 = loc("ptr"(#loc22))
+#loc187 = loc("ptr"(#loc23))
+#loc188 = loc("kv_indices"(#loc26))
+#loc189 = loc("kv_start"(#loc27))
+#loc190 = loc("kv_start"(#loc28))
+#loc191 = loc("kv_num_blocks"(#loc29))
+#loc192 = loc("kv_num_blocks"(#loc30))
+#loc193 = loc("block_n_end"(#loc31))
+#loc194 = loc("block_n_end"(#loc33))
+#loc195 = loc("block_n_end"(#loc35))
+#loc196 = loc("block_n_end"(#loc36))
+#loc197 = loc("offs_n"(#loc37))
+#loc198 = loc("offs_n"(#loc38))
+#loc199 = loc("k"(#loc40))
+#loc201 = loc("m"(#loc44))
+#loc202 = loc("n"(#loc45))
+#loc203 = loc("tmp3"(#loc46))
+#loc204 = loc("tmp5"(#loc47))
+#loc205 = loc("tmp6"(#loc48))
+#loc206 = loc("tmp7"(#loc49))
+#loc207 = loc("tmp9"(#loc50))
+#loc208 = loc("tmp14"(#loc51))
+#loc209 = loc("tmp14"(#loc52))
+#loc210 = loc("tmp14"(#loc53))
+#loc211 = loc("tmp14"(#loc54))
+#loc212 = loc("tmp14"(#loc55))
+#loc213 = loc("tmp14"(#loc56))
+#loc214 = loc("tmp17"(#loc57))
+#loc215 = loc("v"(#loc58))
+#loc216 = loc("acc"(#loc59))
+#loc217 = loc("offs_n_load"(#loc60))
+#loc218 = loc("kv_base_offset"(#loc61))
+#loc219 = loc("qk"(#loc62))
+#loc220 = loc("k"(#loc63))
+#loc221 = loc("qk"(#loc64))
+#loc222 = loc("post_mod_scores"(#loc65))
+#loc223 = loc("post_mod_scores"(#loc66))
+#loc224 = loc("tmp8"(#loc67))
+#loc225 = loc("tmp10"(#loc68))
+#loc226 = loc("tmp11"(#loc69))
+#loc227 = loc("tmp16"(#loc70))
+#loc228 = loc("tmp16"(#loc71))
+#loc229 = loc("tmp16"(#loc72))
+#loc230 = loc("tmp16"(#loc73))
+#loc231 = loc("tmp16"(#loc74))
+#loc232 = loc("tmp16"(#loc75))
+#loc233 = loc("tmp18"(#loc76))
+#loc234 = loc("tmp19"(#loc77))
+#loc235 = loc("tmp20"(#loc78))
+#loc236 = loc("mask_mod_output"(#loc79))
+#loc237 = loc("post_mod_scores"(#loc80))
+#loc238 = loc("post_mod_scores"(#loc81))
+#loc240 = loc("m_ij"(#loc85))
+#loc241 = loc("masked_out_rows"(#loc86))
+#loc242 = loc("m_ij_masked"(#loc87))
+#loc243 = loc("alpha"(#loc88))
+#loc244 = loc("alpha"(#loc89))
+#loc245 = loc("p"(#loc90))
+#loc246 = loc("p"(#loc91))
+#loc247 = loc("p"(#loc92))
+#loc248 = loc("l_i"(#loc93))
+#loc250 = loc("l_i"(#loc97))
+#loc251 = loc("acc"(#loc98))
+#loc252 = loc("acc"(#loc99))
+#loc253 = loc("acc"(#loc100))
+#loc254 = loc("acc"(#loc101))
+#loc255 = loc("offs_n"(#loc102))
+#loc256 = loc("cur_block_idx"(#loc103))
+#loc257 = loc("offset"(#loc104))
+#loc258 = loc("cur_block"(#loc105))
+#loc259 = loc("cur_block"(#loc106))
+#loc260 = loc("next_block"(#loc107))
+#loc261 = loc("next_block"(#loc108))
+#loc262 = loc("next_block"(#loc109))
+#loc263 = loc("next_block"(#loc110))
+#loc264 = loc("needs_jump"(#loc111))
+#loc265 = loc("needs_jump"(#loc112))
+#loc266 = loc("needs_jump"(#loc113))
+#loc267 = loc("jump_to_block"(#loc114))
+#loc268 = loc("jump_to_block"(#loc115))
+#loc269 = loc("jump_to_block"(#loc116))
+#loc270 = loc("offset"(#loc117))
+#loc271 = loc("offset"(#loc118))
+#loc272 = loc("offset"(#loc119))
+#loc273 = loc("offset"(#loc120))
+#loc274 = loc("kv_offset"(#loc121))
+#loc275 = loc("kv_indices"(#loc122))
+#loc276 = loc("kv_start"(#loc123))
+#loc277 = loc("kv_start"(#loc124))
+#loc278 = loc("kv_num_blocks"(#loc125))
+#loc279 = loc("kv_num_blocks"(#loc126))
+#loc280 = loc("block_n_end"(#loc127))
+#loc281 = loc("block_n_end"(#loc128))
+#loc282 = loc("offs_n"(#loc129))
+#loc284 = loc("l_i"(#loc132))
+#loc285 = loc("l_i"(#loc133))
+#loc286 = loc("acc"(#loc134))
+#loc287 = loc("acc"(#loc135))
+#loc288 = loc("mask"(#loc136))
+#loc289 = loc("mask"(#loc137))
+#loc290 = loc("off_hz"(#loc142))
+#loc291 = loc("off_hz"(#loc143))
+#loc292 = loc("l_ptrs"(#loc144))
+#loc293 = loc("l_ptrs"(#loc145))
+#loc294 = loc("l_ptrs"(#loc146))
+#loc295 = loc("lse"(#loc147))
+#loc296 = loc("lse"(#loc148))
+#loc297 = loc(callsite(#loc182 at #loc183))
+#loc298 = loc(callsite(#loc184 at #loc183))
+#loc299 = loc(callsite(#loc185 at #loc183))
+#loc300 = loc(callsite(#loc186 at #loc183))
+#loc301 = loc(callsite(#loc187 at #loc183))
+#loc302 = loc(callsite(#loc24 at #loc183))
+#loc303 = loc(callsite(#loc25 at #loc183))
+#loc304 = loc(callsite(#loc32 at #loc194))
+#loc305 = loc(callsite(#loc34 at #loc194))
+#loc306 = loc(callsite(#loc199 at #loc200))
+#loc307 = loc(callsite(#loc201 at #loc200))
+#loc308 = loc(callsite(#loc202 at #loc200))
+#loc309 = loc(callsite(#loc203 at #loc200))
+#loc310 = loc(callsite(#loc204 at #loc200))
+#loc311 = loc(callsite(#loc205 at #loc200))
+#loc312 = loc(callsite(#loc206 at #loc200))
+#loc313 = loc(callsite(#loc207 at #loc200))
+#loc314 = loc(callsite(#loc208 at #loc200))
+#loc315 = loc(callsite(#loc209 at #loc200))
+#loc316 = loc(callsite(#loc210 at #loc200))
+#loc317 = loc(callsite(#loc211 at #loc200))
+#loc318 = loc(callsite(#loc212 at #loc200))
+#loc319 = loc(callsite(#loc213 at #loc200))
+#loc320 = loc(callsite(#loc214 at #loc200))
+#loc321 = loc(callsite(#loc215 at #loc200))
+#loc322 = loc("l_i"(#loc216))
+#loc323 = loc(callsite(#loc217 at #loc200))
+#loc324 = loc(callsite(#loc218 at #loc200))
+#loc325 = loc(callsite(#loc219 at #loc200))
+#loc326 = loc(callsite(#loc220 at #loc200))
+#loc327 = loc(callsite(#loc221 at #loc200))
+#loc328 = loc(callsite(#loc222 at #loc200))
+#loc329 = loc(callsite(#loc223 at #loc200))
+#loc330 = loc(callsite(#loc224 at #loc200))
+#loc331 = loc(callsite(#loc225 at #loc200))
+#loc332 = loc(callsite(#loc226 at #loc200))
+#loc333 = loc(callsite(#loc227 at #loc200))
+#loc334 = loc(callsite(#loc228 at #loc200))
+#loc335 = loc(callsite(#loc229 at #loc200))
+#loc336 = loc(callsite(#loc230 at #loc200))
+#loc337 = loc(callsite(#loc231 at #loc200))
+#loc338 = loc(callsite(#loc232 at #loc200))
+#loc339 = loc(callsite(#loc233 at #loc200))
+#loc340 = loc(callsite(#loc234 at #loc200))
+#loc341 = loc(callsite(#loc235 at #loc200))
+#loc342 = loc(callsite(#loc236 at #loc200))
+#loc343 = loc(callsite(#loc237 at #loc200))
+#loc344 = loc(callsite(#loc238 at #loc200))
+#loc346 = loc(callsite(#loc240 at #loc200))
+#loc347 = loc(callsite(#loc241 at #loc200))
+#loc348 = loc(callsite(#loc242 at #loc200))
+#loc349 = loc(callsite(#loc243 at #loc200))
+#loc350 = loc(callsite(#loc244 at #loc200))
+#loc351 = loc(callsite(#loc245 at #loc200))
+#loc352 = loc(callsite(#loc246 at #loc200))
+#loc353 = loc(callsite(#loc247 at #loc200))
+#loc354 = loc(callsite(#loc248 at #loc200))
+#loc356 = loc(callsite(#loc250 at #loc200))
+#loc357 = loc(callsite(#loc251 at #loc200))
+#loc358 = loc(callsite(#loc252 at #loc200))
+#loc359 = loc(callsite(#loc253 at #loc200))
+#loc360 = loc(callsite(#loc254 at #loc200))
+#loc361 = loc(callsite(#loc255 at #loc42))
+#loc362 = loc(callsite(#loc257 at #loc42))
+#loc363 = loc(callsite(#loc274 at #loc42))
+#loc364 = loc(callsite(#loc199 at #loc283))
+#loc365 = loc(callsite(#loc215 at #loc283))
+#loc366 = loc(callsite(#loc217 at #loc283))
+#loc367 = loc(callsite(#loc218 at #loc283))
+#loc368 = loc(callsite(#loc219 at #loc283))
+#loc369 = loc(callsite(#loc220 at #loc283))
+#loc370 = loc(callsite(#loc221 at #loc283))
+#loc371 = loc(callsite(#loc222 at #loc283))
+#loc372 = loc(callsite(#loc223 at #loc283))
+#loc373 = loc(callsite(#loc238 at #loc283))
+#loc375 = loc(callsite(#loc240 at #loc283))
+#loc376 = loc(callsite(#loc241 at #loc283))
+#loc377 = loc(callsite(#loc242 at #loc283))
+#loc378 = loc(callsite(#loc243 at #loc283))
+#loc379 = loc(callsite(#loc244 at #loc283))
+#loc380 = loc(callsite(#loc245 at #loc283))
+#loc381 = loc(callsite(#loc246 at #loc283))
+#loc382 = loc(callsite(#loc247 at #loc283))
+#loc383 = loc(callsite(#loc248 at #loc283))
+#loc385 = loc(callsite(#loc250 at #loc283))
+#loc386 = loc(callsite(#loc251 at #loc283))
+#loc387 = loc(callsite(#loc252 at #loc283))
+#loc388 = loc(callsite(#loc253 at #loc283))
+#loc389 = loc(callsite(#loc254 at #loc283))
+#loc390 = loc(callsite(#loc255 at #loc131))
+#loc391 = loc(callsite(#loc257 at #loc131))
+#loc392 = loc(callsite(#loc274 at #loc131))
+#loc393 = loc(callsite(#loc185 at #loc306))
+#loc394 = loc(callsite(#loc187 at #loc306))
+#loc395 = loc(callsite(#loc24 at #loc306))
+#loc396 = loc(callsite(#loc43 at #loc307))
+#loc397 = loc(callsite(#loc43 at #loc308))
+#loc398 = loc(callsite(#loc185 at #loc321))
+#loc399 = loc(callsite(#loc25 at #loc306))
+#loc400 = loc(callsite(#loc25 at #loc321))
+#loc401 = loc("m_i"(#loc322))
+#loc402 = loc(callsite(#loc182 at #loc306))
+#loc403 = loc(callsite(#loc184 at #loc306))
+#loc404 = loc(callsite(#loc187 at #loc321))
+#loc405 = loc(callsite(#loc82 at #loc345))
+#loc407 = loc(callsite(#loc94 at #loc355))
+#loc409 = loc(callsite(#loc256 at #loc362))
+#loc410 = loc(callsite(#loc258 at #loc362))
+#loc411 = loc(callsite(#loc259 at #loc362))
+#loc412 = loc(callsite(#loc260 at #loc362))
+#loc413 = loc(callsite(#loc261 at #loc362))
+#loc414 = loc(callsite(#loc262 at #loc362))
+#loc415 = loc(callsite(#loc263 at #loc362))
+#loc416 = loc(callsite(#loc264 at #loc362))
+#loc417 = loc(callsite(#loc265 at #loc362))
+#loc418 = loc(callsite(#loc266 at #loc362))
+#loc419 = loc(callsite(#loc267 at #loc362))
+#loc420 = loc(callsite(#loc268 at #loc362))
+#loc421 = loc(callsite(#loc269 at #loc362))
+#loc422 = loc(callsite(#loc270 at #loc362))
+#loc423 = loc(callsite(#loc271 at #loc362))
+#loc424 = loc(callsite(#loc272 at #loc362))
+#loc425 = loc(callsite(#loc273 at #loc362))
+#loc426 = loc(callsite(#loc25 at #loc364))
+#loc427 = loc(callsite(#loc25 at #loc365))
+#loc428 = loc(callsite(#loc182 at #loc364))
+#loc429 = loc(callsite(#loc184 at #loc364))
+#loc430 = loc(callsite(#loc185 at #loc364))
+#loc431 = loc(callsite(#loc187 at #loc364))
+#loc432 = loc(callsite(#loc24 at #loc364))
+#loc433 = loc(callsite(#loc185 at #loc365))
+#loc434 = loc(callsite(#loc187 at #loc365))
+#loc435 = loc(callsite(#loc82 at #loc374))
+#loc437 = loc(callsite(#loc94 at #loc384))
+#loc439 = loc(callsite(#loc256 at #loc391))
+#loc440 = loc(callsite(#loc258 at #loc391))
+#loc441 = loc(callsite(#loc259 at #loc391))
+#loc442 = loc(callsite(#loc260 at #loc391))
+#loc443 = loc(callsite(#loc261 at #loc391))
+#loc444 = loc(callsite(#loc262 at #loc391))
+#loc445 = loc(callsite(#loc263 at #loc391))
+#loc446 = loc(callsite(#loc264 at #loc391))
+#loc447 = loc(callsite(#loc265 at #loc391))
+#loc448 = loc(callsite(#loc266 at #loc391))
+#loc449 = loc(callsite(#loc267 at #loc391))
+#loc450 = loc(callsite(#loc268 at #loc391))
+#loc451 = loc(callsite(#loc269 at #loc391))
+#loc452 = loc(callsite(#loc270 at #loc391))
+#loc453 = loc(callsite(#loc271 at #loc391))
+#loc454 = loc(callsite(#loc272 at #loc391))
+#loc455 = loc(callsite(#loc273 at #loc391))
+#loc456 = loc("offs_n"(#loc401))
+#loc457 = loc(callsite(#loc84 at #loc405))
+#loc458 = loc(callsite(#loc96 at #loc407))
+#loc459 = loc(callsite(#loc84 at #loc435))
+#loc460 = loc(callsite(#loc96 at #loc437))
+#loc461 = loc("kv_offset"(#loc456))
+#loc462 = loc(callsite(#loc461 at #loc42))
+#loc463 = loc(callsite(#loc461 at #loc131))

progress/github/SpecForge/cache/compiled_kernels/triton/1/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir ADDED Viewed

	@@ -0,0 +1,780 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":18:0)
+#loc1 = loc(unknown)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":172:41)
+#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":520:16)
+#loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":403:51)
+#loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":416:34)
+#loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":198:45)
+#loc156 = loc("arg_Q"(#loc))
+#loc157 = loc("arg_K"(#loc))
+#loc158 = loc("arg_V"(#loc))
+#loc159 = loc("arg_LSE"(#loc))
+#loc160 = loc("arg_MAX"(#loc))
+#loc161 = loc("arg_KV_NUM_BLKS"(#loc))
+#loc162 = loc("arg_KV_IDX"(#loc))
+#loc163 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
+#loc164 = loc("arg_FULL_KV_IDX"(#loc))
+#loc165 = loc("out_ptr0"(#loc))
+#loc166 = loc("ks0"(#loc))
+#loc167 = loc("ks1"(#loc))
+#loc168 = loc("ks2"(#loc))
+#loc169 = loc("ks3"(#loc))
+#loc170 = loc("ks4"(#loc))
+#loc210 = loc(callsite(#loc48 at #loc2))
+#loc247 = loc("m_ij"(#loc87))
+#loc257 = loc("l_i"(#loc99))
+#loc293 = loc(callsite(#loc48 at #loc137))
+#loc354 = loc(callsite(#loc247 at #loc210))
+#loc364 = loc(callsite(#loc257 at #loc210))
+#loc383 = loc(callsite(#loc247 at #loc293))
+#loc393 = loc(callsite(#loc257 at #loc293))
+#loc413 = loc(callsite(#loc1 at #loc354))
+#loc415 = loc(callsite(#loc1 at #loc364))
+#loc443 = loc(callsite(#loc1 at #loc383))
+#loc445 = loc(callsite(#loc1 at #loc393))
+module {
+  tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<1024> : tensor<64x1xi32> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1)
+    %cst_1 = arith.constant dense<16> : tensor<1x64xi32> loc(#loc171)
+    %cst_2 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc171)
+    %cst_3 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<false> : tensor<128x64xi1> loc(#loc171)
+    %cst_6 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc171)
+    %cst_7 = arith.constant dense<1> : tensor<128x1xi32> loc(#loc171)
+    %cst_8 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc171)
+    %cst_9 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc171)
+    %cst_10 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1)
+    %c63_i32 = arith.constant 63 : i32 loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %q = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc306)
+    %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc307)
+    %cst_13 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1)
+    %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc174)
+    %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc175)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %HQ = arith.constant 32 : i32 loc(#loc176)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10)
+    %q_start = tt.get_program_id x : i32 loc(#loc177)
+    %off_zq = tt.get_program_id y : i32 loc(#loc178)
+    %off_hq = tt.get_program_id z : i32 loc(#loc179)
+    %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc180)
+    %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc181)
+    %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc182)
+    %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc183)
+    %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc184)
+    %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr<bf16>, i32 loc(#loc185)
+    %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc186)
+    %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc187)
+    %sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc188)
+    %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc189)
+    %offs_m_17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc190)
+    %offs_m_18 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc191)
+    %offs_m_19 = arith.addi %offs_m_18, %offs_m_17 : tensor<128xi32> loc(#loc191)
+    %ptr = tt.expand_dims %offs_m_19 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc308)
+    %ptr_20 = arith.muli %ptr, %cst_13 : tensor<128x1xi32> loc(#loc309)
+    %ptr_21 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>> loc(#loc310)
+    %ptr_22 = tt.addptr %ptr_21, %ptr_20 : tensor<128x1x!tt.ptr<bf16>>, tensor<128x1xi32> loc(#loc310)
+    %ptr_23 = tt.expand_dims %offs_m_17 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc311)
+    %ptr_24 = tt.broadcast %ptr_22 : tensor<128x1x!tt.ptr<bf16>> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc312)
+    %ptr_25 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc312)
+    %ptr_26 = tt.addptr %ptr_24, %ptr_25 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc312)
+    %q_27 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc313)
+    %q_28 = arith.cmpi slt, %ptr, %q_27 : tensor<128x1xi32> loc(#loc313)
+    %q_29 = tt.broadcast %q_28 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc306)
+    %q_30 = tt.load %ptr_26, %q_29, %q : tensor<128x128x!tt.ptr<bf16>> loc(#loc306)
+    %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc197)
+    %kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc198)
+    %kv_start_31 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc199)
+    %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc200)
+    %kv_num_blocks_32 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc201)
+    %block_n_end = arith.muli %kv_num_blocks_32, %c2_i32 : i32 loc(#loc202)
+    %block_n_end_33 = arith.addi %ks1, %c63_i32 : i32 loc(#loc314)
+    %block_n_end_34 = arith.divsi %block_n_end_33, %c64_i32 : i32 loc(#loc315)
+    %block_n_end_35 = arith.maxsi %block_n_end_34, %c1_i32 : i32 loc(#loc204)
+    %block_n_end_36 = arith.minsi %block_n_end, %block_n_end_35 : i32 loc(#loc205)
+    %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc206)
+    %offs_n_37 = tt.splat %kv_start_31 : i32 -> tensor<64xi32> loc(#loc207)
+    %offs_n_38 = arith.addi %offs_n_37, %offs_n : tensor<64xi32> loc(#loc207)
+    %1 = tt.expand_dims %offs_n_38 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc45)
+    %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_36 step %c1_i32 iter_args(%acc_62 = %acc, %l_i_63 = %cst_14, %m_i = %cst_3, %offs_n_64 = %1, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32)  : i32 {
+      %kv_base_offset = arith.addi %kv_start_31, %kv_offset_65 : i32 loc(#loc317)
+      %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc318)
+      %offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc318)
+      %ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc404)
+      %ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc405)
+      %ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc406)
+      %ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc406)
+      %ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc407)
+      %ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc407)
+      %ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc407)
+      %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc408)
+      %k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc408)
+      %k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc409)
+      %k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc409)
+      %k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc320)
+      %qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc321)
+      %qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc322)
+      %m = arith.remsi %ptr, %q_27 : tensor<128x1xi32> loc(#loc410)
+      %n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc411)
+      %n_79 = arith.remsi %offs_n_64, %n : tensor<1x64xi32> loc(#loc411)
+      %post_mod_scores = arith.cmpi slt, %offs_n_64, %n : tensor<1x64xi32> loc(#loc325)
+      %post_mod_scores_80 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc326)
+      %post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc326)
+      %tmp3 = arith.cmpi slt, %m, %cst_8 : tensor<128x1xi32> loc(#loc327)
+      %tmp5 = tt.broadcast %n_79 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc328)
+      %tmp5_82 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc328)
+      %tmp5_83 = arith.cmpi sle, %tmp5, %tmp5_82 : tensor<128x64xi32> loc(#loc328)
+      %tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc329)
+      %tmp6_84 = arith.andi %tmp6, %tmp5_83 : tensor<128x64xi1> loc(#loc329)
+      %tmp7 = arith.cmpi sge, %m, %cst_8 : tensor<128x1xi32> loc(#loc330)
+      %tmp8 = arith.cmpi slt, %n_79, %cst_9 : tensor<1x64xi32> loc(#loc331)
+      %tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc332)
+      %tmp9_85 = tt.broadcast %tmp8 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc332)
+      %tmp9_86 = arith.andi %tmp9, %tmp9_85 : tensor<128x64xi1> loc(#loc332)
+      %tmp10 = arith.extui %tmp8 : tensor<1x64xi1> to tensor<1x64xi32> loc(#loc333)
+      %tmp10_87 = arith.cmpi eq, %tmp10, %cst_9 : tensor<1x64xi32> loc(#loc333)
+      %tmp11 = tt.broadcast %tmp10_87 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc334)
+      %tmp11_88 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1> loc(#loc334)
+      %tmp14 = arith.remsi %m, %cst_2 : tensor<128x1xi32> loc(#loc335)
+      %tmp14_89 = arith.cmpi ne, %tmp14, %cst_8 : tensor<128x1xi32> loc(#loc336)
+      %tmp14_90 = arith.divsi %m, %cst_2 : tensor<128x1xi32> loc(#loc337)
+      %tmp14_91 = arith.subi %tmp14_90, %cst_7 : tensor<128x1xi32> loc(#loc338)
+      %tmp14_92 = arith.select %tmp14_89, %tmp14_91, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc339)
+      %tmp14_93 = arith.select %tmp3, %tmp14_92, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc340)
+      %tmp16 = arith.remsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc341)
+      %tmp16_94 = arith.cmpi ne, %tmp16, %cst_9 : tensor<1x64xi32> loc(#loc342)
+      %tmp16_95 = arith.divsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc343)
+      %tmp16_96 = arith.subi %tmp16_95, %cst_6 : tensor<1x64xi32> loc(#loc344)
+      %tmp16_97 = arith.select %tmp16_94, %tmp16_96, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc345)
+      %tmp16_98 = arith.select %tmp8, %tmp16_97, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc346)
+      %tmp17 = tt.broadcast %tmp14_93 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc347)
+      %tmp17_99 = tt.broadcast %tmp16_98 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc347)
+      %tmp17_100 = arith.cmpi eq, %tmp17, %tmp17_99 : tensor<128x64xi32> loc(#loc347)
+      %tmp18 = arith.andi %tmp11_88, %tmp17_100 : tensor<128x64xi1> loc(#loc348)
+      %tmp19 = arith.ori %tmp9_86, %tmp18 : tensor<128x64xi1> loc(#loc349)
+      %tmp20 = arith.ori %tmp6_84, %tmp19 : tensor<128x64xi1> loc(#loc350)
+      %mask_mod_output = arith.select %post_mod_scores_80, %tmp20, %cst_5 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc351)
+      %post_mod_scores_101 = arith.select %mask_mod_output, %post_mod_scores_81, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc352)
+      %post_mod_scores_102 = arith.mulf %post_mod_scores_101, %cst_4 : tensor<128x64xf32> loc(#loc353)
+      %m_ij = "tt.reduce"(%post_mod_scores_102) <{axis = 1 : i32}> ({
+      ^bb0(%m_ij_135: f32 loc(callsite(#loc1 at #loc354)), %m_ij_136: f32 loc(callsite(#loc1 at #loc354))):
+        %m_ij_137 = arith.maxnumf %m_ij_135, %m_ij_136 : f32 loc(#loc467)
+        tt.reduce.return %m_ij_137 : f32 loc(#loc412)
+      }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc412)
+      %m_ij_103 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc355)
+      %masked_out_rows = arith.cmpf oeq, %m_ij_103, %cst_3 : tensor<128xf32> loc(#loc356)
+      %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_103 : tensor<128xi1>, tensor<128xf32> loc(#loc357)
+      %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc358)
+      %alpha_104 = math.exp2 %alpha : tensor<128xf32> loc(#loc359)
+      %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc360)
+      %p_105 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc361)
+      %p_106 = arith.subf %post_mod_scores_102, %p_105 : tensor<128x64xf32> loc(#loc361)
+      %p_107 = math.exp2 %p_106 : tensor<128x64xf32> loc(#loc362)
+      %l_i_108 = arith.mulf %l_i_63, %alpha_104 : tensor<128xf32> loc(#loc363)
+      %l_i_109 = "tt.reduce"(%p_107) <{axis = 1 : i32}> ({
+      ^bb0(%l_i_135: f32 loc(callsite(#loc1 at #loc364)), %l_i_136: f32 loc(callsite(#loc1 at #loc364))):
+        %l_i_137 = arith.addf %l_i_135, %l_i_136 : f32 loc(#loc468)
+        tt.reduce.return %l_i_137 : f32 loc(#loc414)
+      }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc414)
+      %l_i_110 = arith.addf %l_i_108, %l_i_109 : tensor<128xf32> loc(#loc365)
+      %acc_111 = tt.expand_dims %alpha_104 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc366)
+      %acc_112 = tt.broadcast %acc_111 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc367)
+      %acc_113 = arith.mulf %acc_62, %acc_112 : tensor<128x128xf32> loc(#loc367)
+      %ptr_114 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc416)
+      %ptr_115 = tt.addptr %ptr_114, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc416)
+      %ptr_116 = tt.broadcast %ptr_115 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc417)
+      %ptr_117 = tt.addptr %ptr_116, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc417)
+      %v = tt.load %ptr_117, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc418)
+      %acc_118 = arith.truncf %p_107 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc369)
+      %acc_119 = tt.dot %acc_118, %v, %acc_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc370)
+      %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc419)
+      %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc420)
+      %cur_block_120 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc421)
+      %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc422)
+      %next_block_121 = arith.cmpi slt, %next_block, %kv_num_blocks_32 : i32 loc(#loc423)
+      %next_block_122 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc424)
+      %next_block_123 = tt.load %next_block_122, %next_block_121 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc425)
+      %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc426)
+      %needs_jump_124 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc427)
+      %needs_jump_125 = arith.cmpi eq, %needs_jump_124, %c0_i32 : i32 loc(#loc428)
+      %jump_to_block = arith.subi %next_block_123, %cur_block_120 : i32 loc(#loc429)
+      %jump_to_block_126 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc430)
+      %jump_to_block_127 = arith.subi %jump_to_block_126, %c64_i32 : i32 loc(#loc431)
+      %offset = arith.extui %needs_jump_125 : i1 to i32 loc(#loc432)
+      %offset_128 = arith.muli %jump_to_block_127, %offset : i32 loc(#loc432)
+      %offset_129 = arith.subi %c1_i32, %offset : i32 loc(#loc433)
+      %offset_130 = arith.muli %offset_129, %c64_i32 : i32 loc(#loc434)
+      %offset_131 = arith.addi %offset_128, %offset_130 : i32 loc(#loc435)
+      %offs_n_132 = tt.splat %offset_131 : i32 -> tensor<1x64xi32> loc(#loc372)
+      %offs_n_133 = arith.addi %offs_n_64, %offs_n_132 : tensor<1x64xi32> loc(#loc372)
+      %kv_offset_134 = arith.addi %kv_offset_65, %offset_131 : i32 loc(#loc373)
+      scf.yield %acc_119, %l_i_110, %m_ij_103, %offs_n_133, %kv_offset_134 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc284)
+    } loc(#loc472)
+    %kv_indices_39 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc285)
+    %kv_start_40 = tt.load %kv_indices_39 : !tt.ptr<i32> loc(#loc286)
+    %kv_start_41 = arith.muli %kv_start_40, %c128_i32 : i32 loc(#loc287)
+    %kv_num_blocks_42 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc288)
+    %kv_num_blocks_43 = tt.load %kv_num_blocks_42 : !tt.ptr<i32> loc(#loc289)
+    %block_n_end_44 = arith.muli %kv_num_blocks_43, %c2_i32 : i32 loc(#loc290)
+    %block_n_end_45 = arith.minsi %block_n_end_44, %block_n_end_35 : i32 loc(#loc291)
+    %offs_n_46 = tt.splat %kv_start_41 : i32 -> tensor<64xi32> loc(#loc292)
+    %offs_n_47 = arith.addi %offs_n_46, %offs_n : tensor<64xi32> loc(#loc292)
+    %2 = tt.expand_dims %offs_n_47 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc136)
+    %kv_offset_48:5 = scf.for %start_n = %c0_i32 to %block_n_end_45 step %c1_i32 iter_args(%acc_62 = %kv_offset#0, %l_i_63 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_64 = %2, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32)  : i32 {
+      %kv_base_offset = arith.addi %kv_start_41, %kv_offset_65 : i32 loc(#loc374)
+      %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc375)
+      %offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc375)
+      %ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc436)
+      %ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc437)
+      %ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc438)
+      %ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc438)
+      %ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc439)
+      %ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc439)
+      %ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc439)
+      %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc440)
+      %k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc440)
+      %k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc441)
+      %k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc441)
+      %k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc377)
+      %qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc378)
+      %qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc379)
+      %post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc380)
+      %post_mod_scores_79 = arith.cmpi slt, %offs_n_64, %post_mod_scores : tensor<1x64xi32> loc(#loc380)
+      %post_mod_scores_80 = tt.broadcast %post_mod_scores_79 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc381)
+      %post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc381)
+      %post_mod_scores_82 = arith.mulf %post_mod_scores_81, %cst_4 : tensor<128x64xf32> loc(#loc382)
+      %m_ij = "tt.reduce"(%post_mod_scores_82) <{axis = 1 : i32}> ({
+      ^bb0(%m_ij_115: f32 loc(callsite(#loc1 at #loc383)), %m_ij_116: f32 loc(callsite(#loc1 at #loc383))):
+        %m_ij_117 = arith.maxnumf %m_ij_115, %m_ij_116 : f32 loc(#loc469)
+        tt.reduce.return %m_ij_117 : f32 loc(#loc442)
+      }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc442)
+      %m_ij_83 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc384)
+      %masked_out_rows = arith.cmpf oeq, %m_ij_83, %cst_3 : tensor<128xf32> loc(#loc385)
+      %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_83 : tensor<128xi1>, tensor<128xf32> loc(#loc386)
+      %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc387)
+      %alpha_84 = math.exp2 %alpha : tensor<128xf32> loc(#loc388)
+      %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc389)
+      %p_85 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc390)
+      %p_86 = arith.subf %post_mod_scores_82, %p_85 : tensor<128x64xf32> loc(#loc390)
+      %p_87 = math.exp2 %p_86 : tensor<128x64xf32> loc(#loc391)
+      %l_i_88 = arith.mulf %l_i_63, %alpha_84 : tensor<128xf32> loc(#loc392)
+      %l_i_89 = "tt.reduce"(%p_87) <{axis = 1 : i32}> ({
+      ^bb0(%l_i_115: f32 loc(callsite(#loc1 at #loc393)), %l_i_116: f32 loc(callsite(#loc1 at #loc393))):
+        %l_i_117 = arith.addf %l_i_115, %l_i_116 : f32 loc(#loc470)
+        tt.reduce.return %l_i_117 : f32 loc(#loc444)
+      }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc444)
+      %l_i_90 = arith.addf %l_i_88, %l_i_89 : tensor<128xf32> loc(#loc394)
+      %acc_91 = tt.expand_dims %alpha_84 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc395)
+      %acc_92 = tt.broadcast %acc_91 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc396)
+      %acc_93 = arith.mulf %acc_62, %acc_92 : tensor<128x128xf32> loc(#loc396)
+      %ptr_94 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc446)
+      %ptr_95 = tt.addptr %ptr_94, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc446)
+      %ptr_96 = tt.broadcast %ptr_95 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc447)
+      %ptr_97 = tt.addptr %ptr_96, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc447)
+      %v = tt.load %ptr_97, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc448)
+      %acc_98 = arith.truncf %p_87 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc398)
+      %acc_99 = tt.dot %acc_98, %v, %acc_93, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc399)
+      %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc449)
+      %cur_block = tt.addptr %kv_indices_39, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc450)
+      %cur_block_100 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc451)
+      %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc452)
+      %next_block_101 = arith.cmpi slt, %next_block, %kv_num_blocks_43 : i32 loc(#loc453)
+      %next_block_102 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc454)
+      %next_block_103 = tt.load %next_block_102, %next_block_101 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc455)
+      %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc456)
+      %needs_jump_104 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc457)
+      %needs_jump_105 = arith.cmpi eq, %needs_jump_104, %c0_i32 : i32 loc(#loc458)
+      %jump_to_block = arith.subi %next_block_103, %cur_block_100 : i32 loc(#loc459)
+      %jump_to_block_106 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc460)
+      %jump_to_block_107 = arith.subi %jump_to_block_106, %c64_i32 : i32 loc(#loc461)
+      %offset = arith.extui %needs_jump_105 : i1 to i32 loc(#loc462)
+      %offset_108 = arith.muli %jump_to_block_107, %offset : i32 loc(#loc462)
+      %offset_109 = arith.subi %c1_i32, %offset : i32 loc(#loc463)
+      %offset_110 = arith.muli %offset_109, %c64_i32 : i32 loc(#loc464)
+      %offset_111 = arith.addi %offset_108, %offset_110 : i32 loc(#loc465)
+      %offs_n_112 = tt.splat %offset_111 : i32 -> tensor<1x64xi32> loc(#loc401)
+      %offs_n_113 = arith.addi %offs_n_64, %offs_n_112 : tensor<1x64xi32> loc(#loc401)
+      %kv_offset_114 = arith.addi %kv_offset_65, %offset_111 : i32 loc(#loc402)
+      scf.yield %acc_99, %l_i_90, %m_ij_83, %offs_n_113, %kv_offset_114 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc294)
+    } loc(#loc473)
+    %l_i_49 = arith.cmpf oeq, %kv_offset_48#1, %cst_14 : tensor<128xf32> loc(#loc295)
+    %l_i_50 = arith.select %l_i_49, %l_i, %kv_offset_48#1 : tensor<128xi1>, tensor<128xf32> loc(#loc175)
+    %acc_51 = tt.expand_dims %l_i_50 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc296)
+    %acc_52 = tt.broadcast %acc_51 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc297)
+    %acc_53 = arith.divf %kv_offset_48#0, %acc_52 : tensor<128x128xf32> loc(#loc297)
+    %mask_54 = arith.cmpi slt, %ptr_23, %mask : tensor<1x128xi32> loc(#loc174)
+    %mask_55 = tt.broadcast %mask_54 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc298)
+    %mask_56 = arith.andi %q_29, %mask_55 : tensor<128x128xi1> loc(#loc298)
+    %3 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32> loc(#loc142)
+    %4 = arith.addi %ptr_23, %3 : tensor<1x128xi32> loc(#loc142)
+    %5 = tt.broadcast %4 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc143)
+    %6 = tt.broadcast %ptr_20 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc143)
+    %7 = arith.addi %5, %6 : tensor<128x128xi32> loc(#loc143)
+    %8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc144)
+    %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc144)
+    %10 = arith.truncf %acc_53 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc145)
+    tt.store %9, %10, %mask_56 : tensor<128x128x!tt.ptr<bf16>> loc(#loc145)
+    %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc299)
+    %off_hz_57 = arith.addi %off_hz, %off_hq : i32 loc(#loc300)
+    %l_ptrs = arith.muli %off_hz_57, %ks0 : i32 loc(#loc301)
+    %l_ptrs_58 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc302)
+    %l_ptrs_59 = tt.splat %l_ptrs_58 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc303)
+    %l_ptrs_60 = tt.addptr %l_ptrs_59, %offs_m_19 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc303)
+    %lse = math.log2 %l_i_50 : tensor<128xf32> loc(#loc304)
+    %lse_61 = arith.addf %kv_offset_48#2, %lse : tensor<128xf32> loc(#loc305)
+    %11 = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc153)
+    %12 = arith.cmpi slt, %offs_m_19, %11 : tensor<128xi32> loc(#loc153)
+    tt.store %l_ptrs_60, %lse_61, %12 : tensor<128x!tt.ptr<f32>> loc(#loc154)
+    tt.return loc(#loc155)
+  } loc(#loc)
+} loc(#loc)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":292:23)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":146:101)
+#loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":136:19)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":214:38)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":206:34)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":90:9)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":85:54)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":97:28)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":98:27)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":99:27)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":104:24)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":107:24)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":107:45)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":107:36)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":108:47)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":111:12)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":112:12)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":113:12)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":143:97)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":144:23)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":144:46)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":144:33)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:27)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:38)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:20)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:56)
+#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":284:49)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":292:52)
+#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":151:26)
+#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":152:23)
+#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":152:37)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":153:42)
+#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":153:28)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:45)
+#loc38 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:92)
+#loc40 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
+#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:102)
+#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":154:65)
+#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":159:37)
+#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":159:24)
+#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":167:48)
+#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":484:40)
+#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":342:32)
+#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":346:35)
+#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":347:107)
+#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":349:17)
+#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":351:19)
+#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":353:14)
+#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":257:21)
+#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":358:36)
+#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":359:36)
+#loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":367:44)
+#loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":367:69)
+#loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":372:22)
+#loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":374:23)
+#loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":375:22)
+#loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":376:23)
+#loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":377:22)
+#loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":378:22)
+#loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":379:24)
+#loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":380:23)
+#loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:70)
+#loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:79)
+#loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:91)
+#loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:99)
+#loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:102)
+#loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":383:119)
+#loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:70)
+#loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:79)
+#loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:91)
+#loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:99)
+#loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:102)
+#loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":385:119)
+#loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":386:25)
+#loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":387:24)
+#loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":388:23)
+#loc82 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":389:23)
+#loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":394:73)
+#loc84 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":396:69)
+#loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":399:27)
+#loc86 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
+#loc88 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
+#loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":403:27)
+#loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":405:35)
+#loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":406:51)
+#loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":410:31)
+#loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":410:25)
+#loc94 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":411:51)
+#loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":411:39)
+#loc96 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":411:21)
+#loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":416:16)
+#loc98 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc100 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":416:24)
+#loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":418:22)
+#loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":418:16)
+#loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":421:107)
+#loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":422:22)
+#loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":422:44)
+#loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":247:33)
+#loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":527:63)
+#loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":248:38)
+#loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":248:24)
+#loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:109)
+#loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:113)
+#loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:55)
+#loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":249:25)
+#loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":250:30)
+#loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":250:35)
+#loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":250:60)
+#loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":251:34)
+#loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":251:48)
+#loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":251:63)
+#loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:29)
+#loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:47)
+#loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:61)
+#loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":252:42)
+#loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":530:26)
+#loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":531:21)
+#loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":531:8)
+#loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":181:35)
+#loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":182:27)
+#loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":182:41)
+#loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":183:51)
+#loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":183:32)
+#loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":184:49)
+#loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":184:69)
+#loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":186:28)
+#loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":193:52)
+#loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":206:26)
+#loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":208:20)
+#loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":208:16)
+#loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":214:30)
+#loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:49)
+#loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:62)
+#loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:25)
+#loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":218:92)
+#loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":221:26)
+#loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":221:31)
+#loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":222:32)
+#loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":222:23)
+#loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":222:40)
+#loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":223:33)
+#loc152 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":223:20)
+#loc153 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":227:48)
+#loc154 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":227:29)
+#loc155 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/vi/cviuncbgf6gw2ilej3jwbvnvnhxltsoxsxysnqbw2r6nrbtpxkk5.py":229:4)
+#loc171 = loc(callsite(#loc1 at #loc2))
+#loc172 = loc("q"(#loc4))
+#loc173 = loc("acc"(#loc6))
+#loc174 = loc("mask"(#loc7))
+#loc175 = loc("l_i"(#loc8))
+#loc176 = loc("HQ"(#loc9))
+#loc177 = loc("q_start"(#loc11))
+#loc178 = loc("off_zq"(#loc12))
+#loc179 = loc("off_hq"(#loc13))
+#loc180 = loc("off_hkv"(#loc14))
+#loc181 = loc("q_offset"(#loc15))
+#loc182 = loc("q_offset"(#loc16))
+#loc183 = loc("q_offset"(#loc17))
+#loc184 = loc("k_offset"(#loc18))
+#loc185 = loc("Q"(#loc19))
+#loc186 = loc("K"(#loc20))
+#loc187 = loc("V"(#loc21))
+#loc188 = loc("sparse_kv_idx_offset"(#loc22))
+#loc189 = loc("offs_m"(#loc23))
+#loc190 = loc("offs_m"(#loc24))
+#loc191 = loc("offs_m"(#loc25))
+#loc192 = loc("ptr"(#loc26))
+#loc193 = loc("ptr"(#loc27))
+#loc194 = loc("ptr"(#loc28))
+#loc195 = loc("ptr"(#loc29))
+#loc196 = loc("ptr"(#loc30))
+#loc197 = loc("kv_indices"(#loc32))
+#loc198 = loc("kv_start"(#loc33))
+#loc199 = loc("kv_start"(#loc34))
+#loc200 = loc("kv_num_blocks"(#loc35))
+#loc201 = loc("kv_num_blocks"(#loc36))
+#loc202 = loc("block_n_end"(#loc37))
+#loc203 = loc("block_n_end"(#loc39))
+#loc204 = loc("block_n_end"(#loc41))
+#loc205 = loc("block_n_end"(#loc42))
+#loc206 = loc("offs_n"(#loc43))
+#loc207 = loc("offs_n"(#loc44))
+#loc208 = loc("acc"(#loc46))
+#loc209 = loc("kv_base_offset"(#loc47))
+#loc211 = loc("offs_n_load"(#loc49))
+#loc212 = loc("k"(#loc50))
+#loc213 = loc("k"(#loc51))
+#loc214 = loc("qk"(#loc52))
+#loc215 = loc("qk"(#loc53))
+#loc216 = loc("m"(#loc55))
+#loc217 = loc("n"(#loc56))
+#loc218 = loc("post_mod_scores"(#loc57))
+#loc219 = loc("post_mod_scores"(#loc58))
+#loc220 = loc("tmp3"(#loc59))
+#loc221 = loc("tmp5"(#loc60))
+#loc222 = loc("tmp6"(#loc61))
+#loc223 = loc("tmp7"(#loc62))
+#loc224 = loc("tmp8"(#loc63))
+#loc225 = loc("tmp9"(#loc64))
+#loc226 = loc("tmp10"(#loc65))
+#loc227 = loc("tmp11"(#loc66))
+#loc228 = loc("tmp14"(#loc67))
+#loc229 = loc("tmp14"(#loc68))
+#loc230 = loc("tmp14"(#loc69))
+#loc231 = loc("tmp14"(#loc70))
+#loc232 = loc("tmp14"(#loc71))
+#loc233 = loc("tmp14"(#loc72))
+#loc234 = loc("tmp16"(#loc73))
+#loc235 = loc("tmp16"(#loc74))
+#loc236 = loc("tmp16"(#loc75))
+#loc237 = loc("tmp16"(#loc76))
+#loc238 = loc("tmp16"(#loc77))
+#loc239 = loc("tmp16"(#loc78))
+#loc240 = loc("tmp17"(#loc79))
+#loc241 = loc("tmp18"(#loc80))
+#loc242 = loc("tmp19"(#loc81))
+#loc243 = loc("tmp20"(#loc82))
+#loc244 = loc("mask_mod_output"(#loc83))
+#loc245 = loc("post_mod_scores"(#loc84))
+#loc246 = loc("post_mod_scores"(#loc85))
+#loc248 = loc("m_ij"(#loc89))
+#loc249 = loc("masked_out_rows"(#loc90))
+#loc250 = loc("m_ij_masked"(#loc91))
+#loc251 = loc("alpha"(#loc92))
+#loc252 = loc("alpha"(#loc93))
+#loc253 = loc("p"(#loc94))
+#loc254 = loc("p"(#loc95))
+#loc255 = loc("p"(#loc96))
+#loc256 = loc("l_i"(#loc97))
+#loc258 = loc("l_i"(#loc101))
+#loc259 = loc("acc"(#loc102))
+#loc260 = loc("acc"(#loc103))
+#loc261 = loc("v"(#loc104))
+#loc262 = loc("acc"(#loc105))
+#loc263 = loc("acc"(#loc106))
+#loc264 = loc("cur_block_idx"(#loc107))
+#loc265 = loc("offset"(#loc108))
+#loc266 = loc("cur_block"(#loc109))
+#loc267 = loc("cur_block"(#loc110))
+#loc268 = loc("next_block"(#loc111))
+#loc269 = loc("next_block"(#loc112))
+#loc270 = loc("next_block"(#loc113))
+#loc271 = loc("next_block"(#loc114))
+#loc272 = loc("needs_jump"(#loc115))
+#loc273 = loc("needs_jump"(#loc116))
+#loc274 = loc("needs_jump"(#loc117))
+#loc275 = loc("jump_to_block"(#loc118))
+#loc276 = loc("jump_to_block"(#loc119))
+#loc277 = loc("jump_to_block"(#loc120))
+#loc278 = loc("offset"(#loc121))
+#loc279 = loc("offset"(#loc122))
+#loc280 = loc("offset"(#loc123))
+#loc281 = loc("offset"(#loc124))
+#loc282 = loc("offs_n"(#loc125))
+#loc283 = loc("kv_offset"(#loc126))
+#loc284 = loc(callsite(#loc127 at #loc2))
+#loc285 = loc("kv_indices"(#loc128))
+#loc286 = loc("kv_start"(#loc129))
+#loc287 = loc("kv_start"(#loc130))
+#loc288 = loc("kv_num_blocks"(#loc131))
+#loc289 = loc("kv_num_blocks"(#loc132))
+#loc290 = loc("block_n_end"(#loc133))
+#loc291 = loc("block_n_end"(#loc134))
+#loc292 = loc("offs_n"(#loc135))
+#loc294 = loc(callsite(#loc127 at #loc137))
+#loc295 = loc("l_i"(#loc138))
+#loc296 = loc("acc"(#loc139))
+#loc297 = loc("acc"(#loc140))
+#loc298 = loc("mask"(#loc141))
+#loc299 = loc("off_hz"(#loc146))
+#loc300 = loc("off_hz"(#loc147))
+#loc301 = loc("l_ptrs"(#loc148))
+#loc302 = loc("l_ptrs"(#loc149))
+#loc303 = loc("l_ptrs"(#loc150))
+#loc304 = loc("lse"(#loc151))
+#loc305 = loc("lse"(#loc152))
+#loc306 = loc(callsite(#loc3 at #loc172))
+#loc307 = loc(callsite(#loc5 at #loc173))
+#loc308 = loc(callsite(#loc192 at #loc172))
+#loc309 = loc(callsite(#loc193 at #loc172))
+#loc310 = loc(callsite(#loc194 at #loc172))
+#loc311 = loc(callsite(#loc195 at #loc172))
+#loc312 = loc(callsite(#loc196 at #loc172))
+#loc313 = loc(callsite(#loc31 at #loc172))
+#loc314 = loc(callsite(#loc38 at #loc203))
+#loc315 = loc(callsite(#loc40 at #loc203))
+#loc316 = loc("l_i"(#loc208))
+#loc317 = loc(callsite(#loc209 at #loc210))
+#loc318 = loc(callsite(#loc211 at #loc210))
+#loc319 = loc(callsite(#loc212 at #loc210))
+#loc320 = loc(callsite(#loc213 at #loc210))
+#loc321 = loc(callsite(#loc214 at #loc210))
+#loc322 = loc(callsite(#loc215 at #loc210))
+#loc323 = loc(callsite(#loc216 at #loc210))
+#loc324 = loc(callsite(#loc217 at #loc210))
+#loc325 = loc(callsite(#loc218 at #loc210))
+#loc326 = loc(callsite(#loc219 at #loc210))
+#loc327 = loc(callsite(#loc220 at #loc210))
+#loc328 = loc(callsite(#loc221 at #loc210))
+#loc329 = loc(callsite(#loc222 at #loc210))
+#loc330 = loc(callsite(#loc223 at #loc210))
+#loc331 = loc(callsite(#loc224 at #loc210))
+#loc332 = loc(callsite(#loc225 at #loc210))
+#loc333 = loc(callsite(#loc226 at #loc210))
+#loc334 = loc(callsite(#loc227 at #loc210))
+#loc335 = loc(callsite(#loc228 at #loc210))
+#loc336 = loc(callsite(#loc229 at #loc210))
+#loc337 = loc(callsite(#loc230 at #loc210))
+#loc338 = loc(callsite(#loc231 at #loc210))
+#loc339 = loc(callsite(#loc232 at #loc210))
+#loc340 = loc(callsite(#loc233 at #loc210))
+#loc341 = loc(callsite(#loc234 at #loc210))
+#loc342 = loc(callsite(#loc235 at #loc210))
+#loc343 = loc(callsite(#loc236 at #loc210))
+#loc344 = loc(callsite(#loc237 at #loc210))
+#loc345 = loc(callsite(#loc238 at #loc210))
+#loc346 = loc(callsite(#loc239 at #loc210))
+#loc347 = loc(callsite(#loc240 at #loc210))
+#loc348 = loc(callsite(#loc241 at #loc210))
+#loc349 = loc(callsite(#loc242 at #loc210))
+#loc350 = loc(callsite(#loc243 at #loc210))
+#loc351 = loc(callsite(#loc244 at #loc210))
+#loc352 = loc(callsite(#loc245 at #loc210))
+#loc353 = loc(callsite(#loc246 at #loc210))
+#loc355 = loc(callsite(#loc248 at #loc210))
+#loc356 = loc(callsite(#loc249 at #loc210))
+#loc357 = loc(callsite(#loc250 at #loc210))
+#loc358 = loc(callsite(#loc251 at #loc210))
+#loc359 = loc(callsite(#loc252 at #loc210))
+#loc360 = loc(callsite(#loc253 at #loc210))
+#loc361 = loc(callsite(#loc254 at #loc210))
+#loc362 = loc(callsite(#loc255 at #loc210))
+#loc363 = loc(callsite(#loc256 at #loc210))
+#loc365 = loc(callsite(#loc258 at #loc210))
+#loc366 = loc(callsite(#loc259 at #loc210))
+#loc367 = loc(callsite(#loc260 at #loc210))
+#loc368 = loc(callsite(#loc261 at #loc210))
+#loc369 = loc(callsite(#loc262 at #loc210))
+#loc370 = loc(callsite(#loc263 at #loc210))
+#loc371 = loc(callsite(#loc265 at #loc2))
+#loc372 = loc(callsite(#loc282 at #loc2))
+#loc373 = loc(callsite(#loc283 at #loc2))
+#loc374 = loc(callsite(#loc209 at #loc293))
+#loc375 = loc(callsite(#loc211 at #loc293))
+#loc376 = loc(callsite(#loc212 at #loc293))
+#loc377 = loc(callsite(#loc213 at #loc293))
+#loc378 = loc(callsite(#loc214 at #loc293))
+#loc379 = loc(callsite(#loc215 at #loc293))
+#loc380 = loc(callsite(#loc218 at #loc293))
+#loc381 = loc(callsite(#loc219 at #loc293))
+#loc382 = loc(callsite(#loc246 at #loc293))
+#loc384 = loc(callsite(#loc248 at #loc293))
+#loc385 = loc(callsite(#loc249 at #loc293))
+#loc386 = loc(callsite(#loc250 at #loc293))
+#loc387 = loc(callsite(#loc251 at #loc293))
+#loc388 = loc(callsite(#loc252 at #loc293))
+#loc389 = loc(callsite(#loc253 at #loc293))
+#loc390 = loc(callsite(#loc254 at #loc293))
+#loc391 = loc(callsite(#loc255 at #loc293))
+#loc392 = loc(callsite(#loc256 at #loc293))
+#loc394 = loc(callsite(#loc258 at #loc293))
+#loc395 = loc(callsite(#loc259 at #loc293))
+#loc396 = loc(callsite(#loc260 at #loc293))
+#loc397 = loc(callsite(#loc261 at #loc293))
+#loc398 = loc(callsite(#loc262 at #loc293))
+#loc399 = loc(callsite(#loc263 at #loc293))
+#loc400 = loc(callsite(#loc265 at #loc137))
+#loc401 = loc(callsite(#loc282 at #loc137))
+#loc402 = loc(callsite(#loc283 at #loc137))
+#loc403 = loc("m_i"(#loc316))
+#loc404 = loc(callsite(#loc192 at #loc319))
+#loc405 = loc(callsite(#loc193 at #loc319))
+#loc406 = loc(callsite(#loc194 at #loc319))
+#loc407 = loc(callsite(#loc196 at #loc319))
+#loc408 = loc(callsite(#loc31 at #loc319))
+#loc409 = loc(callsite(#loc3 at #loc319))
+#loc410 = loc(callsite(#loc54 at #loc323))
+#loc411 = loc(callsite(#loc54 at #loc324))
+#loc412 = loc(callsite(#loc86 at #loc354))
+#loc414 = loc(callsite(#loc98 at #loc364))
+#loc416 = loc(callsite(#loc194 at #loc368))
+#loc417 = loc(callsite(#loc196 at #loc368))
+#loc418 = loc(callsite(#loc3 at #loc368))
+#loc419 = loc(callsite(#loc264 at #loc371))
+#loc420 = loc(callsite(#loc266 at #loc371))
+#loc421 = loc(callsite(#loc267 at #loc371))
+#loc422 = loc(callsite(#loc268 at #loc371))
+#loc423 = loc(callsite(#loc269 at #loc371))
+#loc424 = loc(callsite(#loc270 at #loc371))
+#loc425 = loc(callsite(#loc271 at #loc371))
+#loc426 = loc(callsite(#loc272 at #loc371))
+#loc427 = loc(callsite(#loc273 at #loc371))
+#loc428 = loc(callsite(#loc274 at #loc371))
+#loc429 = loc(callsite(#loc275 at #loc371))
+#loc430 = loc(callsite(#loc276 at #loc371))
+#loc431 = loc(callsite(#loc277 at #loc371))
+#loc432 = loc(callsite(#loc278 at #loc371))
+#loc433 = loc(callsite(#loc279 at #loc371))
+#loc434 = loc(callsite(#loc280 at #loc371))
+#loc435 = loc(callsite(#loc281 at #loc371))
+#loc436 = loc(callsite(#loc192 at #loc376))
+#loc437 = loc(callsite(#loc193 at #loc376))
+#loc438 = loc(callsite(#loc194 at #loc376))
+#loc439 = loc(callsite(#loc196 at #loc376))
+#loc440 = loc(callsite(#loc31 at #loc376))
+#loc441 = loc(callsite(#loc3 at #loc376))
+#loc442 = loc(callsite(#loc86 at #loc383))
+#loc444 = loc(callsite(#loc98 at #loc393))
+#loc446 = loc(callsite(#loc194 at #loc397))
+#loc447 = loc(callsite(#loc196 at #loc397))
+#loc448 = loc(callsite(#loc3 at #loc397))
+#loc449 = loc(callsite(#loc264 at #loc400))
+#loc450 = loc(callsite(#loc266 at #loc400))
+#loc451 = loc(callsite(#loc267 at #loc400))
+#loc452 = loc(callsite(#loc268 at #loc400))
+#loc453 = loc(callsite(#loc269 at #loc400))
+#loc454 = loc(callsite(#loc270 at #loc400))
+#loc455 = loc(callsite(#loc271 at #loc400))
+#loc456 = loc(callsite(#loc272 at #loc400))
+#loc457 = loc(callsite(#loc273 at #loc400))
+#loc458 = loc(callsite(#loc274 at #loc400))
+#loc459 = loc(callsite(#loc275 at #loc400))
+#loc460 = loc(callsite(#loc276 at #loc400))
+#loc461 = loc(callsite(#loc277 at #loc400))
+#loc462 = loc(callsite(#loc278 at #loc400))
+#loc463 = loc(callsite(#loc279 at #loc400))
+#loc464 = loc(callsite(#loc280 at #loc400))
+#loc465 = loc(callsite(#loc281 at #loc400))
+#loc466 = loc("offs_n"(#loc403))
+#loc467 = loc(callsite(#loc88 at #loc412))
+#loc468 = loc(callsite(#loc100 at #loc414))
+#loc469 = loc(callsite(#loc88 at #loc442))
+#loc470 = loc(callsite(#loc100 at #loc444))
+#loc471 = loc("kv_offset"(#loc466))
+#loc472 = loc(callsite(#loc471 at #loc2))
+#loc473 = loc(callsite(#loc471 at #loc137))

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"child_paths": {"triton_poi_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source", "triton_poi_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir", "triton_poi_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir", "triton_poi_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir", "triton_poi_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx", "triton_poi_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin", "triton_poi_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json"}}

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin ADDED Viewed

Binary file (10.3 kB). View file

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"hash": "d650530c018e98a61be4958bef98391bf5f4932885981bbdd4c94cc375f6e8e4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_1"}

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir ADDED Viewed

	@@ -0,0 +1,89 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 7, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 127, !dbg !9
+  %11 = or disjoint i32 %8, %10, !dbg !10
+  %12 = icmp slt i32 %11, %3, !dbg !11
+  %13 = sext i32 %11 to i64, !dbg !12
+  %.frozen = freeze i64 %2, !dbg !13
+  %14 = sdiv i64 %13, %.frozen, !dbg !13
+  %15 = mul i64 %14, %.frozen, !dbg !12
+  %.decomposed = sub i64 %13, %15, !dbg !12
+  %.not = icmp ne i64 %.decomposed, 0, !dbg !17
+  %16 = icmp slt i32 %8, 0, !dbg !18
+  %17 = icmp slt i64 %2, 0, !dbg !19
+  %18 = xor i1 %16, %17, !dbg !20
+  %narrow = select i1 %18, i1 %.not, i1 false, !dbg !21
+  %19 = sext i1 %narrow to i64, !dbg !21
+  %20 = add nsw i64 %14, %19, !dbg !21
+  %21 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !22
+  %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !23
+  %23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %21, i64 %22, i1 %12) #2, !dbg !23
+  %24 = bitcast i32 %23 to float, !dbg !23
+  %25 = fmul float %24, 0x3FE62E4300000000, !dbg !24
+  %26 = icmp slt i64 %2, 2, !dbg !25
+  %27 = icmp sgt i64 %2, 1, !dbg !26
+  %28 = select i1 %27, i64 %2, i64 0, !dbg !27
+  %29 = zext i1 %26 to i64, !dbg !28
+  %30 = add i64 %28, %29, !dbg !29
+  %31 = mul i64 %20, %30, !dbg !30
+  %32 = getelementptr float, ptr addrspace(1) %1, i64 %.decomposed, !dbg !31
+  %33 = getelementptr float, ptr addrspace(1) %32, i64 %31, !dbg !31
+  %34 = bitcast float %25 to i32, !dbg !32
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %34, ptr addrspace(1) %33, i1 %12) #2, !dbg !32
+  ret void, !dbg !33
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_1", linkageName: "triton_poi_fused_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 19, column: 28, scope: !4)
+!8 = !DILocation(line: 19, column: 33, scope: !4)
+!9 = !DILocation(line: 20, column: 36, scope: !4)
+!10 = !DILocation(line: 20, column: 23, scope: !4)
+!11 = !DILocation(line: 21, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 72, column: 16, scope: !14, inlinedAt: !16)
+!14 = distinct !DILexicalBlockFile(scope: !4, file: !15, discriminator: 0)
+!15 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
+!16 = !DILocation(line: 24, column: 51, scope: !4)
+!17 = !DILocation(line: 74, column: 34, scope: !14, inlinedAt: !16)
+!18 = !DILocation(line: 75, column: 25, scope: !14, inlinedAt: !16)
+!19 = !DILocation(line: 75, column: 36, scope: !14, inlinedAt: !16)
+!20 = !DILocation(line: 75, column: 32, scope: !14, inlinedAt: !16)
+!21 = !DILocation(line: 75, column: 47, scope: !14, inlinedAt: !16)
+!22 = !DILocation(line: 25, column: 30, scope: !4)
+!23 = !DILocation(line: 25, column: 35, scope: !4)
+!24 = !DILocation(line: 27, column: 18, scope: !4)
+!25 = !DILocation(line: 28, column: 49, scope: !4)
+!26 = !DILocation(line: 28, column: 75, scope: !4)
+!27 = !DILocation(line: 28, column: 66, scope: !4)
+!28 = !DILocation(line: 28, scope: !4)
+!29 = !DILocation(line: 28, column: 57, scope: !4)
+!30 = !DILocation(line: 28, column: 34, scope: !4)
+!31 = !DILocation(line: 28, column: 25, scope: !4)
+!32 = !DILocation(line: 28, column: 88, scope: !4)
+!33 = !DILocation(line: 28, column: 4, scope: !4)

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx ADDED Viewed

	@@ -0,0 +1,357 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.7
+.target sm_90a
+.address_size 64
+	// .globl	triton_poi_fused_mul_1  // -- Begin function triton_poi_fused_mul_1
+                                        // @triton_poi_fused_mul_1
+.visible .entry triton_poi_fused_mul_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_1,
+	.param .u64 triton_poi_fused_mul_1_param_2,
+	.param .u32 triton_poi_fused_mul_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_5
+)
+.reqntid 128
+{
+	.reg .pred 	%p<11>;
+	.reg .b32 	%r<13>;
+	.reg .b64 	%rd<30>;
+	.loc	1 18 0                          // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:18:0
+// %bb.0:
+	ld.param.b32 	%r2, [triton_poi_fused_mul_1_param_3];
+	ld.param.b64 	%rd7, [triton_poi_fused_mul_1_param_1];
+	ld.param.b64 	%rd6, [triton_poi_fused_mul_1_param_0];
+	ld.param.b64 	%rd8, [triton_poi_fused_mul_1_param_2];
+$L__tmp0:
+	.loc	1 19 28                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:19:28
+	mov.u32 	%r3, %ctaid.x;
+	.loc	1 19 33                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:19:33
+	shl.b32 	%r1, %r3, 7;
+	.loc	1 20 36                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:20:36
+	mov.u32 	%r4, %tid.x;
+	and.b32 	%r5, %r4, 127;
+	.loc	1 20 23                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:20:23
+	or.b32 	%r6, %r1, %r5;
+	.loc	1 23 19                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:23:19
+	cvt.s64.s32 	%rd1, %r6;
+$L__tmp1:
+	.loc	2 72 16                         // triton_helpers.py:72:16 @[ ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:24:51 ]
+	or.b64 	%rd10, %rd1, %rd8;
+	and.b64 	%rd11, %rd10, -4294967296;
+	setp.ne.b64 	%p1, %rd11, 0;
+	@%p1 bra 	$L__BB0_2;
+	bra.uni 	$L__BB0_1;
+$L__BB0_2:
+	div.s64 	%rd29, %rd1, %rd8;
+	bra.uni 	$L__BB0_3;
+$L__BB0_1:
+	cvt.u32.u64 	%r7, %rd8;
+	cvt.u32.u64 	%r8, %rd1;
+	div.u32 	%r9, %r8, %r7;
+	cvt.u64.u32 	%rd29, %r9;
+$L__tmp2:
+$L__BB0_3:
+	.loc	2 0 16                          // triton_helpers.py:0:16
+	cvt.u32.u64 	%r12, %rd1;
+	.loc	1 21 21                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:21:21
+	setp.lt.s32 	%p2, %r12, %r2;
+	.loc	1 23 19                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:23:19
+	mul.lo.s64 	%rd17, %rd29, %rd8;
+	sub.s64 	%rd18, %rd1, %rd17;
+$L__tmp3:
+	.loc	2 74 34                         // triton_helpers.py:74:34 @[ ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:24:51 ]
+	setp.ne.b64 	%p4, %rd18, 0;
+	.loc	2 75 25                         // triton_helpers.py:75:25 @[ ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:24:51 ]
+	setp.lt.s32 	%p5, %r1, 0;
+	.loc	2 75 36                         // triton_helpers.py:75:36 @[ ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:24:51 ]
+	setp.lt.s64 	%p6, %rd8, 0;
+	.loc	2 75 32                         // triton_helpers.py:75:32 @[ ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:24:51 ]
+	xor.pred 	%p7, %p5, %p6;
+	.loc	2 75 47                         // triton_helpers.py:75:47 @[ ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:24:51 ]
+	and.pred 	%p8, %p7, %p4;
+	selp.b64 	%rd19, -1, 0, %p8;
+	add.s64 	%rd20, %rd29, %rd19;
+$L__tmp4:
+	.loc	1 25 30                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:25:30
+	shl.b64 	%rd21, %rd1, 2;
+	add.s64 	%rd13, %rd6, %rd21;
+	.loc	1 25 35                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:25:35
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, 0x0;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	.loc	1 27 18                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:27:18
+	mul.f32 	%r11, %r10, 0f3F317218;
+	.loc	1 28 49                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:49
+	setp.lt.s64 	%p9, %rd8, 2;
+	.loc	1 28 75                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:75
+	setp.gt.s64 	%p10, %rd8, 1;
+	.loc	1 28 66                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:66
+	selp.b64 	%rd22, %rd8, 0, %p10;
+	.loc	1 28 0                          // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28
+	selp.b64 	%rd23, 1, 0, %p9;
+	.loc	1 28 57                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:57
+	add.s64 	%rd24, %rd22, %rd23;
+	.loc	1 28 34                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:34
+	mul.lo.s64 	%rd25, %rd20, %rd24;
+	.loc	1 28 25                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:25
+	shl.b64 	%rd26, %rd18, 2;
+	add.s64 	%rd27, %rd7, %rd26;
+	shl.b64 	%rd28, %rd25, 2;
+	add.s64 	%rd15, %rd27, %rd28;
+	.loc	1 28 88                         // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:88
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd15 + 0 ], { %r11 };
+	// end inline asm
+	.loc	1 28 4                          // ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py:28:4
+	ret;
+$L__tmp5:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py"
+	.file	2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 211                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xcc DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 107
+.b8 99
+.b8 114
+.b8 120
+.b8 101
+.b8 51
+.b8 101
+.b8 105
+.b8 105
+.b8 118
+.b8 99
+.b8 108
+.b8 101
+.b8 52
+.b8 112
+.b8 122
+.b8 122
+.b8 102
+.b8 110
+.b8 111
+.b8 120
+.b8 102
+.b8 108
+.b8 98
+.b8 103
+.b8 108
+.b8 53
+.b8 119
+.b8 112
+.b8 50
+.b8 115
+.b8 115
+.b8 100
+.b8 102
+.b8 113
+.b8 100
+.b8 50
+.b8 119
+.b8 119
+.b8 118
+.b8 103
+.b8 101
+.b8 121
+.b8 118
+.b8 108
+.b8 112
+.b8 98
+.b8 116
+.b8 105
+.b8 119
+.b8 120
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 119
+.b8 111
+.b8 114
+.b8 107
+.b8 115
+.b8 112
+.b8 97
+.b8 99
+.b8 101
+.b8 47
+.b8 104
+.b8 97
+.b8 110
+.b8 114
+.b8 117
+.b8 105
+.b8 47
+.b8 106
+.b8 117
+.b8 110
+.b8 113
+.b8 117
+.b8 97
+.b8 110
+.b8 47
+.b8 83
+.b8 112
+.b8 101
+.b8 99
+.b8 70
+.b8 111
+.b8 114
+.b8 103
+.b8 101
+.b8 47
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 107
+.b8 101
+.b8 114
+.b8 110
+.b8 101
+.b8 108
+.b8 115
+.b8 47
+.b8 107
+.b8 99
+.b8 0
+.b8 2                                   // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 112
+.b8 111
+.b8 105
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0xa8:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 143                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
+.b32 143                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp4                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 24                                  // DW_AT_call_line
+.b8 51                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source ADDED Viewed

	@@ -0,0 +1,130 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":18:0)
+#loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
+#loc34 = loc("in_ptr0"(#loc))
+#loc35 = loc("out_ptr0"(#loc))
+#loc36 = loc("ks0"(#loc))
+#loc37 = loc("xnumel"(#loc))
+#loc49 = loc("a"(#loc22))
+#loc50 = loc("b"(#loc22))
+module {
+  tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xoffset = tt.get_program_id x : i32 loc(#loc38)
+    %xoffset_0 = arith.constant 128 : i32 loc(#loc39)
+    %xoffset_1 = arith.constant 128 : i32 loc(#loc39)
+    %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc39)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
+    %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
+    %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
+    %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
+    %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
+    %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
+    %x1 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%xindex_4, %ks0) : (tensor<128xi32>, i64) -> tensor<128xi64> loc(#loc44)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc45)
+    %tmp0_8 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc45)
+    %tmp0_9 = tt.load %tmp0_8, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc46)
+    %tmp1 = arith.constant 0.693147182 : f32 loc(#loc47)
+    %tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc48)
+    %tmp2_10 = arith.mulf %tmp0_9, %tmp2 : tensor<128xf32> loc(#loc48)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc12)
+    %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc12)
+    %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc12)
+    %c1_i32_11 = arith.constant 1 : i32 loc(#loc13)
+    %c1_i32_12 = arith.constant 1 : i32 loc(#loc13)
+    %2 = arith.extui %1 : i1 to i32 loc(#loc13)
+    %3 = arith.muli %c1_i32_12, %2 : i32 loc(#loc13)
+    %c1_i32_13 = arith.constant 1 : i32 loc(#loc14)
+    %4 = arith.extsi %c1_i32_13 : i32 to i64 loc(#loc14)
+    %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc14)
+    %6 = arith.extui %5 : i1 to i64 loc(#loc15)
+    %7 = arith.muli %ks0, %6 : i64 loc(#loc15)
+    %8 = arith.extsi %3 : i32 to i64 loc(#loc16)
+    %9 = arith.addi %8, %7 : i64 loc(#loc16)
+    %10 = tt.splat %9 : i64 -> tensor<128xi64> loc(#loc17)
+    %11 = arith.muli %x1, %10 : tensor<128xi64> loc(#loc17)
+    %12 = arith.addi %x0_7, %11 : tensor<128xi64> loc(#loc18)
+    %13 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc19)
+    %14 = tt.addptr %13, %12 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc19)
+    tt.store %14, %tmp2_10, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc20)
+    tt.return loc(#loc21)
+  } loc(#loc)
+  tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%a: tensor<128xi32> loc("a"(#loc22)), %b: i64 loc("b"(#loc22))) -> tensor<128xi64> attributes {noinline = false} {
+    %quot = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc51)
+    %quot_0 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc51)
+    %quot_1 = arith.divsi %quot, %quot_0 : tensor<128xi64> loc(#loc51)
+    %remainder = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc52)
+    %remainder_2 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc52)
+    %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<128xi64> loc(#loc52)
+    %fixed = arith.constant 0 : i32 loc(#loc53)
+    %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc53)
+    %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<128xi64> loc(#loc53)
+    %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<128xi64> loc(#loc53)
+    %fixed_7 = arith.constant 1 : i32 loc(#loc54)
+    %fixed_8 = arith.constant 1 : i64 loc(#loc54)
+    %fixed_9 = arith.constant dense<1> : tensor<128xi64> loc(#loc54)
+    %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<128xi64> loc(#loc54)
+    %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc55)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc28)
+    %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc28)
+    %0 = arith.cmpi slt, %a, %cst : tensor<128xi32> loc(#loc28)
+    %c0_i32_12 = arith.constant 0 : i32 loc(#loc29)
+    %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc29)
+    %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc29)
+    %3 = tt.splat %2 : i1 -> tensor<128xi1> loc(#loc30)
+    %4 = arith.cmpi ne, %0, %3 : tensor<128xi1> loc(#loc30)
+    %5 = arith.select %4, %fixed_11, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc31)
+    tt.return %5 : tensor<128xi64> loc(#loc32)
+  ^bb1:  // no predecessors
+    %6 = ub.poison : tensor<128xi64> loc(#loc33)
+    tt.return %6 : tensor<128xi64> loc(#loc33)
+  } loc(#loc22)
+} loc(#loc)
+#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":19:28)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":19:33)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":20:36)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":20:23)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":21:21)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":23:19)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":24:51)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":25:30)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":25:35)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":26:11)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":27:18)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:49)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:41)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:75)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:66)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:57)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:34)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:30)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:25)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:88)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:4)
+#loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
+#loc24 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
+#loc25 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
+#loc26 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
+#loc27 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
+#loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
+#loc29 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
+#loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
+#loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
+#loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
+#loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
+#loc38 = loc("xoffset"(#loc1))
+#loc39 = loc("xoffset"(#loc2))
+#loc40 = loc("xindex"(#loc3))
+#loc41 = loc("xindex"(#loc4))
+#loc42 = loc("xmask"(#loc5))
+#loc43 = loc("x0"(#loc6))
+#loc44 = loc("x1"(#loc7))
+#loc45 = loc("tmp0"(#loc8))
+#loc46 = loc("tmp0"(#loc9))
+#loc47 = loc("tmp1"(#loc10))
+#loc48 = loc("tmp2"(#loc11))
+#loc51 = loc("quot"(#loc23))
+#loc52 = loc("remainder"(#loc24))
+#loc53 = loc("fixed"(#loc25))
+#loc54 = loc("fixed"(#loc26))
+#loc55 = loc("fixed"(#loc27))

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir ADDED Viewed

	@@ -0,0 +1,105 @@

+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":18:0)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("ks0"(#loc))
+#loc33 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %cst = arith.constant dense<0.693147182> : tensor<128xf32, #blocked> loc(#loc1)
+    %c1_i64 = arith.constant 1 : i64 loc(#loc1)
+    %c0_i64 = arith.constant 0 : i64 loc(#loc1)
+    %cst_0 = arith.constant dense<0> : tensor<128xi64, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<128xi64, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_3 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc36)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<128xi32, #blocked> loc(#loc37)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<128xi32, #blocked> loc(#loc37)
+    %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc38)
+    %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128xi32, #blocked> loc(#loc38)
+    %x0 = arith.extsi %xindex_5 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc39)
+    %x0_7 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc39)
+    %x0_8 = arith.remsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc39)
+    %quot = arith.divsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc49)
+    %fixed = arith.cmpi ne, %x0_8, %cst_0 : tensor<128xi64, #blocked> loc(#loc50)
+    %fixed_9 = arith.subi %quot, %cst_2 : tensor<128xi64, #blocked> loc(#loc51)
+    %fixed_10 = arith.select %fixed, %fixed_9, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc52)
+    %x1 = arith.cmpi slt, %xindex_5, %cst_1 : tensor<128xi32, #blocked> loc(#loc53)
+    %x1_11 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc54)
+    %x1_12 = tt.splat %x1_11 : i1 -> tensor<128xi1, #blocked> loc(#loc55)
+    %x1_13 = arith.cmpi ne, %x1, %x1_12 : tensor<128xi1, #blocked> loc(#loc55)
+    %x1_14 = arith.select %x1_13, %fixed_10, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc56)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc45)
+    %tmp0_15 = tt.addptr %tmp0, %xindex_5 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi32, #blocked> loc(#loc45)
+    %tmp0_16 = tt.load %tmp0_15, %xmask_6 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc46)
+    %tmp2 = arith.mulf %tmp0_16, %cst : tensor<128xf32, #blocked> loc(#loc47)
+    %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
+    %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
+    %2 = arith.extui %1 : i1 to i64 loc(#loc22)
+    %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
+    %4 = arith.extui %0 : i1 to i64 loc(#loc48)
+    %5 = arith.addi %4, %3 : i64 loc(#loc23)
+    %6 = tt.splat %5 : i64 -> tensor<128xi64, #blocked> loc(#loc25)
+    %7 = arith.muli %x1_14, %6 : tensor<128xi64, #blocked> loc(#loc25)
+    %8 = arith.addi %x0_8, %7 : tensor<128xi64, #blocked> loc(#loc26)
+    %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc27)
+    %10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi64, #blocked> loc(#loc27)
+    tt.store %10, %tmp2, %xmask_6 : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":19:28)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":19:33)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":20:36)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":20:23)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":21:21)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":23:19)
+#loc8 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":24:51)
+#loc10 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
+#loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
+#loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
+#loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
+#loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
+#loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
+#loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":25:30)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":25:35)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":27:18)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:49)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:75)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:66)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:57)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:41)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:34)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:30)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:25)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:88)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:4)
+#loc34 = loc("xoffset"(#loc2))
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xindex"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("xmask"(#loc6))
+#loc39 = loc("x0"(#loc7))
+#loc40 = loc("quot"(#loc8))
+#loc41 = loc("x1"(#loc9))
+#loc42 = loc("fixed"(#loc10))
+#loc43 = loc("fixed"(#loc11))
+#loc44 = loc("fixed"(#loc12))
+#loc45 = loc("tmp0"(#loc17))
+#loc46 = loc("tmp0"(#loc18))
+#loc47 = loc("tmp2"(#loc19))
+#loc48 = loc(fused[#loc23, #loc24])
+#loc49 = loc(callsite(#loc40 at #loc41))
+#loc50 = loc(callsite(#loc42 at #loc41))
+#loc51 = loc(callsite(#loc43 at #loc41))
+#loc52 = loc(callsite(#loc44 at #loc41))
+#loc53 = loc(callsite(#loc13 at #loc41))
+#loc54 = loc(callsite(#loc14 at #loc41))
+#loc55 = loc(callsite(#loc15 at #loc41))
+#loc56 = loc(callsite(#loc16 at #loc41))

progress/github/SpecForge/cache/compiled_kernels/triton/1/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir ADDED Viewed

	@@ -0,0 +1,104 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":18:0)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("ks0"(#loc))
+#loc33 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %fixed = arith.constant dense<1> : tensor<128xi64> loc(#loc49)
+    %x1 = arith.constant dense<0> : tensor<128xi32> loc(#loc50)
+    %fixed_0 = arith.constant dense<0> : tensor<128xi64> loc(#loc51)
+    %x1_1 = arith.constant 0 : i64 loc(#loc52)
+    %c1_i64 = arith.constant 1 : i64 loc(#loc6)
+    %tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc37)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc6)
+    %xoffset = tt.get_program_id x : i32 loc(#loc38)
+    %xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc39)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
+    %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
+    %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
+    %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
+    %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
+    %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
+    %quot = arith.divsi %x0, %x0_6 : tensor<128xi64> loc(#loc53)
+    %fixed_8 = arith.cmpi ne, %x0_7, %fixed_0 : tensor<128xi64> loc(#loc51)
+    %fixed_9 = arith.subi %quot, %fixed : tensor<128xi64> loc(#loc49)
+    %fixed_10 = arith.select %fixed_8, %fixed_9, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc54)
+    %x1_11 = arith.cmpi slt, %xindex_4, %x1 : tensor<128xi32> loc(#loc50)
+    %x1_12 = arith.cmpi slt, %ks0, %x1_1 : i64 loc(#loc52)
+    %x1_13 = tt.splat %x1_12 : i1 -> tensor<128xi1> loc(#loc55)
+    %x1_14 = arith.cmpi ne, %x1_11, %x1_13 : tensor<128xi1> loc(#loc55)
+    %x1_15 = arith.select %x1_14, %fixed_10, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc56)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc46)
+    %tmp0_16 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc46)
+    %tmp0_17 = tt.load %tmp0_16, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc47)
+    %tmp2_18 = arith.mulf %tmp0_17, %tmp2 : tensor<128xf32> loc(#loc37)
+    %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
+    %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
+    %2 = arith.extui %1 : i1 to i64 loc(#loc22)
+    %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
+    %4 = arith.extui %0 : i1 to i64 loc(#loc48)
+    %5 = arith.addi %4, %3 : i64 loc(#loc23)
+    %6 = tt.splat %5 : i64 -> tensor<128xi64> loc(#loc25)
+    %7 = arith.muli %x1_15, %6 : tensor<128xi64> loc(#loc25)
+    %8 = arith.addi %x0_7, %7 : tensor<128xi64> loc(#loc26)
+    %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc27)
+    %10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc27)
+    tt.store %10, %tmp2_18, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":24:51)
+#loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
+#loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
+#loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
+#loc6 = loc(unknown)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":27:18)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":19:28)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":19:33)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":20:36)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":20:23)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":21:21)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":23:19)
+#loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
+#loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
+#loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
+#loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":25:30)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":25:35)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:49)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:75)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:66)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:57)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:41)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:34)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:30)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:25)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:88)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/kc/ckcrxe3eiivcle4pzzfnoxflbgl5wp2ssdfqd2wwvgeyvlpbtiwx.py":28:4)
+#loc34 = loc("fixed"(#loc1))
+#loc35 = loc("x1"(#loc2))
+#loc36 = loc("fixed"(#loc4))
+#loc37 = loc("tmp2"(#loc7))
+#loc38 = loc("xoffset"(#loc8))
+#loc39 = loc("xoffset"(#loc9))
+#loc40 = loc("xindex"(#loc10))
+#loc41 = loc("xindex"(#loc11))
+#loc42 = loc("xmask"(#loc12))
+#loc43 = loc("x0"(#loc13))
+#loc44 = loc("quot"(#loc14))
+#loc45 = loc("fixed"(#loc15))
+#loc46 = loc("tmp0"(#loc18))
+#loc47 = loc("tmp0"(#loc19))
+#loc48 = loc(fused[#loc23, #loc24])
+#loc49 = loc(callsite(#loc34 at #loc35))
+#loc50 = loc(callsite(#loc3 at #loc35))
+#loc51 = loc(callsite(#loc36 at #loc35))
+#loc52 = loc(callsite(#loc5 at #loc35))
+#loc53 = loc(callsite(#loc44 at #loc35))
+#loc54 = loc(callsite(#loc45 at #loc35))
+#loc55 = loc(callsite(#loc16 at #loc35))
+#loc56 = loc(callsite(#loc17 at #loc35))

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/__grp__triton_red_fused_mul_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"child_paths": {"triton_red_fused_mul_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.source", "triton_red_fused_mul_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ttir", "triton_red_fused_mul_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ttgir", "triton_red_fused_mul_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.llir", "triton_red_fused_mul_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ptx", "triton_red_fused_mul_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.cubin", "triton_red_fused_mul_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.json"}}

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.cubin ADDED Viewed

Binary file (15.5 kB). View file

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"hash": "e712b37f65a8c1e8fb890da435259f275a07231a0bd189398077a745a7ed7616", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_mul_0"}

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.llir ADDED Viewed

	@@ -0,0 +1,140 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 {
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %10 = shl i32 %9, 6, !dbg !8
+  %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %12 = and i32 %11, 252, !dbg !9
+  %13 = lshr exact i32 %12, 2, !dbg !9
+  %14 = or disjoint i32 %13, %10, !dbg !10
+  %15 = and i32 %11, 3, !dbg !11
+  %16 = sdiv i32 %14, 768, !dbg !12
+  %17 = shl i32 %14, 7, !dbg !13
+  %18 = shl i32 %14, 12
+  %19 = mul i32 %16, -3145600
+  %20 = add i32 %19, %18
+  %21 = zext nneg i32 %15 to i64, !dbg !14
+  %22 = sext i32 %17 to i64, !dbg !14
+  %invariant.gep = getelementptr bfloat, ptr addrspace(1) %1, i64 %22, !dbg !14
+  br label %23, !dbg !14
+23:                                               ; preds = %8, %23
+  %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %23 ]
+  %24 = phi float [ 0.000000e+00, %8 ], [ %39, %23 ]
+  %25 = or disjoint i64 %indvars.iv, %21, !dbg !15
+  %26 = trunc nuw nsw i64 %25 to i32, !dbg !16
+  %27 = add i32 %20, %26, !dbg !16
+  %28 = sext i32 %27 to i64, !dbg !17
+  %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !17
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18
+  %31 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !18
+  %32 = bitcast i16 %31 to bfloat, !dbg !18
+  %33 = fpext bfloat %32 to float, !dbg !19
+  %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %25, !dbg !20
+  %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !21
+  %35 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep, i64 %34, i1 true) #4, !dbg !21
+  %36 = bitcast i16 %35 to bfloat, !dbg !21
+  %37 = fpext bfloat %36 to float, !dbg !22
+  %38 = fmul float %33, %37, !dbg !23
+  %39 = fadd float %24, %38, !dbg !24
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !14
+  %40 = icmp samesign ult i64 %indvars.iv, 124, !dbg !14
+  br i1 %40, label %23, label %41, !dbg !14
+41:                                               ; preds = %23
+  %42 = and i32 %11, 63, !dbg !9
+  %43 = or disjoint i32 %10, %42, !dbg !10
+  %44 = bitcast float %39 to i32, !dbg !25
+  %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 2, i32 31), !dbg !25
+  %46 = bitcast i32 %45 to float, !dbg !25
+  %47 = fadd float %39, %46, !dbg !29
+  %48 = bitcast float %47 to i32, !dbg !25
+  %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 1, i32 31), !dbg !25
+  %50 = bitcast i32 %49 to float, !dbg !25
+  %51 = fadd float %47, %50, !dbg !29
+  %52 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %12, !dbg !30
+  store float %51, ptr addrspace(3) %52, align 4, !dbg !30
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
+  %53 = shl nuw nsw i32 %42, 2, !dbg !30
+  %54 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %53, !dbg !30
+  %55 = load float, ptr addrspace(3) %54, align 4, !dbg !30
+  %56 = sext i32 %43 to i64, !dbg !31
+  %57 = getelementptr float, ptr addrspace(1) %2, i64 %56, !dbg !31
+  %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32
+  %59 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %57, i64 %58) #4, !dbg !32
+  %60 = bitcast i32 %59 to float, !dbg !32
+  %61 = fmul float %60, 0x3FE62E4300000000, !dbg !33
+  %62 = fmul float %61, 0x3FF7154760000000, !dbg !34
+  %63 = fsub float %55, %62, !dbg !30
+  %64 = getelementptr float, ptr addrspace(1) %3, i64 %56, !dbg !35
+  %65 = and i32 %11, 192, !dbg !36
+  %66 = icmp eq i32 %65, 0, !dbg !36
+  %67 = bitcast float %63 to i32, !dbg !36
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %67, ptr addrspace(1) %64, i1 %66) #4, !dbg !36
+  ret void, !dbg !37
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused_mul_0", linkageName: "triton_red_fused_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 39, column: 45, scope: !4)
+!14 = !DILocation(line: 32, column: 40, scope: !4)
+!15 = !DILocation(line: 33, column: 31, scope: !4)
+!16 = !DILocation(line: 38, column: 50, scope: !4)
+!17 = !DILocation(line: 38, column: 34, scope: !4)
+!18 = !DILocation(line: 38, column: 60, scope: !4)
+!19 = !DILocation(line: 38, column: 114, scope: !4)
+!20 = !DILocation(line: 39, column: 34, scope: !4)
+!21 = !DILocation(line: 39, column: 50, scope: !4)
+!22 = !DILocation(line: 39, column: 104, scope: !4)
+!23 = !DILocation(line: 40, column: 22, scope: !4)
+!24 = !DILocation(line: 42, column: 23, scope: !4)
+!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !28)
+!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0)
+!27 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
+!28 = !DILocation(line: 44, column: 25, scope: !4)
+!29 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !28)
+!30 = !DILocation(line: 51, column: 19, scope: !4)
+!31 = !DILocation(line: 45, column: 30, scope: !4)
+!32 = !DILocation(line: 45, column: 35, scope: !4)
+!33 = !DILocation(line: 48, column: 18, scope: !4)
+!34 = !DILocation(line: 50, column: 19, scope: !4)
+!35 = !DILocation(line: 52, column: 25, scope: !4)
+!36 = !DILocation(line: 52, column: 37, scope: !4)
+!37 = !DILocation(line: 52, column: 4, scope: !4)

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ptx ADDED Viewed

	@@ -0,0 +1,396 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.7
+.target sm_90a
+.address_size 64
+	// .globl	triton_red_fused_mul_0  // -- Begin function triton_red_fused_mul_0
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused_mul_0
+.visible .entry triton_red_fused_mul_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_3,
+	.param .u32 triton_red_fused_mul_0_param_4,
+	.param .u32 triton_red_fused_mul_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_6,
+	.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_7
+)
+.reqntid 256
+{
+	.reg .pred 	%p<5>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<44>;
+	.reg .b64 	%rd<30>;
+	.loc	1 18 0                          // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:18:0
+// %bb.0:
+	ld.param.b64 	%rd9, [triton_red_fused_mul_0_param_3];
+	ld.param.b64 	%rd8, [triton_red_fused_mul_0_param_2];
+	ld.param.b64 	%rd7, [triton_red_fused_mul_0_param_0];
+	ld.param.b64 	%rd11, [triton_red_fused_mul_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:23:28
+	mov.u32 	%r7, %ctaid.x;
+	.loc	1 23 33                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:23:33
+	shl.b32 	%r1, %r7, 6;
+	.loc	1 24 44                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:24:44
+	mov.u32 	%r2, %tid.x;
+	and.b32 	%r3, %r2, 252;
+	bfe.u32 	%r8, %r2, 2, 6;
+	.loc	1 24 23                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:24:23
+	or.b32 	%r9, %r8, %r1;
+	.loc	1 26 37                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:26:37
+	and.b32 	%r10, %r2, 3;
+	.loc	1 29 19                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:29:19
+	mul.hi.s32 	%r11, %r9, 715827883;
+	shr.u32 	%r12, %r11, 31;
+	shr.u32 	%r13, %r11, 7;
+	add.s32 	%r14, %r13, %r12;
+	.loc	1 32 40                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:32:40
+	cvt.u64.u32 	%rd12, %r10;
+	shl.b32 	%r15, %r7, 13;
+	shl.b32 	%r16, %r8, 7;
+	or.b32 	%r17, %r15, %r16;
+	cvt.s64.s32 	%rd13, %r17;
+	or.b64 	%rd14, %rd13, %rd12;
+	shl.b64 	%rd15, %rd14, 1;
+	add.s64 	%rd28, %rd11, %rd15;
+	shl.b32 	%r18, %r7, 18;
+	shl.b32 	%r19, %r8, 12;
+	or.b32 	%r20, %r18, %r19;
+	or.b32 	%r21, %r20, %r10;
+	mul.lo.s32 	%r22, %r14, 3145600;
+	sub.s32 	%r23, %r21, %r22;
+	cvt.u64.u32 	%rd2, %r23;
+	mov.b32 	%r43, 0f00000000;
+	mov.b64 	%rd29, -4;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 38 34                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:38:34
+	add.s64 	%rd22, %rd2, %rd29;
+	cvt.u32.u64 	%r24, %rd22;
+	add.s32 	%r25, %r24, 4;
+	mad.wide.s32 	%rd17, %r25, 2, %rd7;
+	.loc	1 38 60                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:38:60
+	// begin inline asm
+	mov.u64 %rd16, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd16, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd17 + 0 ], %rd16;
+	// end inline asm
+	.loc	1 38 114                        // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:38:114
+	cvt.f32.bf16 	%r26, %rs1;
+	.loc	1 39 50                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:39:50
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs3 }, [ %rd28 + 0 ], %rd19;
+	// end inline asm
+	.loc	1 39 104                        // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:39:104
+	cvt.f32.bf16 	%r27, %rs3;
+	.loc	1 42 23                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:42:23
+	fma.rn.f32 	%r43, %r26, %r27, %r43;
+	.loc	1 32 40                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:32:40
+	add.s64 	%rd29, %rd29, 4;
+	add.s64 	%rd28, %rd28, 8;
+	setp.lt.u64 	%p3, %rd29, 124;
+	@%p3 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 24 44                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:24:44
+	and.b32 	%r30, %r2, 63;
+	.loc	1 24 23                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:24:23
+	or.b32 	%r31, %r1, %r30;
+$L__tmp1:
+	.loc	2 291 36                        // standard.py:291:36 @[ cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:44:25 ]
+	shfl.sync.bfly.b32 	%r32, %r43, 2, 31, -1;
+	.loc	2 261 15                        // standard.py:261:15 @[ cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:44:25 ]
+	add.f32 	%r33, %r43, %r32;
+	.loc	2 291 36                        // standard.py:291:36 @[ cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:44:25 ]
+	shfl.sync.bfly.b32 	%r34, %r33, 1, 31, -1;
+	.loc	2 261 15                        // standard.py:261:15 @[ cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:44:25 ]
+	add.f32 	%r35, %r33, %r34;
+$L__tmp2:
+	.loc	1 51 19                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:51:19
+	mov.b32 	%r36, global_smem;
+	add.s32 	%r37, %r36, %r3;
+	st.shared.b32 	[%r37], %r35;
+	bar.sync 	0;
+	shl.b32 	%r38, %r30, 2;
+	add.s32 	%r39, %r36, %r38;
+	ld.shared.b32 	%r40, [%r39];
+	.loc	1 45 30                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:45:30
+	mul.wide.s32 	%rd27, %r31, 4;
+	add.s64 	%rd24, %rd8, %rd27;
+	.loc	1 45 35                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:45:35
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r28, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.b32 { %r28 }, [ %rd24 + 0 ], %rd25;
+	// end inline asm
+	.loc	1 48 18                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:48:18
+	mul.f32 	%r41, %r28, 0fBF317218;
+	.loc	1 51 19                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:51:19
+	fma.rn.f32 	%r29, %r41, 0f3FB8AA3B, %r40;
+	.loc	1 52 25                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:52:25
+	add.s64 	%rd26, %rd9, %rd27;
+	.loc	1 52 37                         // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:52:37
+	and.b32 	%r42, %r2, 192;
+	setp.eq.b32 	%p4, %r42, 0;
+	// begin inline asm
+	@%p4 st.global.b32 [ %rd26 + 0 ], { %r29 };
+	// end inline asm
+	.loc	1 52 4                          // cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py:52:4
+	ret;
+$L__tmp3:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py"
+	.file	2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 211                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xcc DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 114
+.b8 55
+.b8 121
+.b8 113
+.b8 98
+.b8 106
+.b8 121
+.b8 101
+.b8 106
+.b8 113
+.b8 102
+.b8 97
+.b8 50
+.b8 108
+.b8 100
+.b8 55
+.b8 51
+.b8 112
+.b8 113
+.b8 113
+.b8 114
+.b8 50
+.b8 116
+.b8 112
+.b8 55
+.b8 103
+.b8 54
+.b8 121
+.b8 98
+.b8 114
+.b8 104
+.b8 115
+.b8 52
+.b8 117
+.b8 51
+.b8 52
+.b8 120
+.b8 116
+.b8 53
+.b8 115
+.b8 52
+.b8 99
+.b8 117
+.b8 115
+.b8 110
+.b8 116
+.b8 111
+.b8 122
+.b8 101
+.b8 52
+.b8 97
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 119
+.b8 111
+.b8 114
+.b8 107
+.b8 115
+.b8 112
+.b8 97
+.b8 99
+.b8 101
+.b8 47
+.b8 104
+.b8 97
+.b8 110
+.b8 114
+.b8 117
+.b8 105
+.b8 47
+.b8 106
+.b8 117
+.b8 110
+.b8 113
+.b8 117
+.b8 97
+.b8 110
+.b8 47
+.b8 83
+.b8 112
+.b8 101
+.b8 99
+.b8 70
+.b8 111
+.b8 114
+.b8 103
+.b8 101
+.b8 47
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 107
+.b8 101
+.b8 114
+.b8 110
+.b8 101
+.b8 108
+.b8 115
+.b8 47
+.b8 114
+.b8 55
+.b8 0
+.b8 2                                   // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0xa8:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 143                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
+.b32 143                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp2                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.source ADDED Viewed

	@@ -0,0 +1,218 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":18:0)
+#loc45 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
+#loc47 = loc(unknown)
+#loc50 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
+#loc54 = loc("in_ptr0"(#loc))
+#loc55 = loc("in_ptr1"(#loc))
+#loc56 = loc("in_ptr2"(#loc))
+#loc57 = loc("out_ptr1"(#loc))
+#loc58 = loc("xnumel"(#loc))
+#loc59 = loc("r0_numel"(#loc))
+#loc100 = loc("input"(#loc45))
+#loc101 = loc("a"(#loc50))
+#loc102 = loc("b"(#loc50))
+module {
+  tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 24576 : i32 loc(#loc60)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc61)
+    %xoffset = tt.get_program_id x : i32 loc(#loc62)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc63)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc63)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc63)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc64)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc65)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc66)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc66)
+    %xmask = arith.constant true loc(#loc67)
+    %xmask_8 = arith.constant dense<true> : tensor<64x4xi1> loc(#loc67)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc68)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc69)
+    %x0 = arith.constant 768 : i32 loc(#loc70)
+    %x0_10 = arith.constant 768 : i32 loc(#loc70)
+    %x0_11 = arith.constant dense<768> : tensor<64x1xi32> loc(#loc70)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc70)
+    %x1 = arith.constant 768 : i32 loc(#loc71)
+    %x1_13 = arith.constant 768 : i32 loc(#loc71)
+    %x1_14 = arith.constant dense<768> : tensor<64x1xi32> loc(#loc71)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc71)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc72)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16) -> (tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc74)
+      %r0_index_24 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc74)
+      %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc75)
+      %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x4xi32> loc(#loc75)
+      %tmp0 = arith.constant 128 : i32 loc(#loc76)
+      %tmp0_26 = arith.constant 128 : i32 loc(#loc76)
+      %tmp0_27 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76)
+      %tmp0_28 = arith.muli %tmp0_27, %x1_15 : tensor<64x1xi32> loc(#loc76)
+      %tmp0_29 = tt.broadcast %r0_index_24 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc77)
+      %tmp0_30 = tt.broadcast %tmp0_28 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc77)
+      %tmp0_31 = arith.addi %tmp0_29, %tmp0_30 : tensor<64x4xi32> loc(#loc77)
+      %tmp0_32 = arith.constant 4096 : i32 loc(#loc78)
+      %tmp0_33 = arith.constant 4096 : i32 loc(#loc78)
+      %tmp0_34 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78)
+      %tmp0_35 = arith.muli %tmp0_34, %x0_12 : tensor<64x1xi32> loc(#loc78)
+      %tmp0_36 = tt.broadcast %tmp0_35 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc79)
+      %tmp0_37 = arith.addi %tmp0_31, %tmp0_36 : tensor<64x4xi32> loc(#loc79)
+      %tmp0_38 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc80)
+      %tmp0_39 = tt.addptr %tmp0_38, %tmp0_37 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc80)
+      %tmp0_40 = arith.constant 0.000000e+00 : f32 loc(#loc81)
+      %tmp0_41 = tt.broadcast %r0_mask_25 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc81)
+      %tmp0_42 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc81)
+      %tmp0_43 = arith.truncf %tmp0_42 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc81)
+      %tmp0_44 = tt.load %tmp0_39, %tmp0_41, %tmp0_43 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc81)
+      %tmp0_45 = arith.extf %tmp0_44 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc82)
+      %tmp1 = arith.constant 128 : i32 loc(#loc83)
+      %tmp1_46 = arith.constant 128 : i32 loc(#loc83)
+      %tmp1_47 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc83)
+      %tmp1_48 = arith.muli %tmp1_47, %xindex_7 : tensor<64x1xi32> loc(#loc83)
+      %tmp1_49 = tt.broadcast %r0_index_24 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc84)
+      %tmp1_50 = tt.broadcast %tmp1_48 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc84)
+      %tmp1_51 = arith.addi %tmp1_49, %tmp1_50 : tensor<64x4xi32> loc(#loc84)
+      %tmp1_52 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc85)
+      %tmp1_53 = tt.addptr %tmp1_52, %tmp1_51 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc85)
+      %tmp1_54 = arith.constant 0.000000e+00 : f32 loc(#loc86)
+      %tmp1_55 = tt.broadcast %r0_mask_25 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc86)
+      %tmp1_56 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc86)
+      %tmp1_57 = arith.truncf %tmp1_56 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc86)
+      %tmp1_58 = tt.load %tmp1_53, %tmp1_55, %tmp1_57 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc86)
+      %tmp1_59 = arith.extf %tmp1_58 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc87)
+      %tmp2 = arith.mulf %tmp0_45, %tmp1_59 : tensor<64x4xf32> loc(#loc88)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x4xf32> loc(#loc89)
+      %_tmp4_60 = tt.broadcast %r0_mask_25 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc90)
+      %_tmp4_61 = arith.select %_tmp4_60, %tmp5, %_tmp4_23 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc90)
+      scf.yield %_tmp4_61 : tensor<64x4xf32> loc(#loc32)
+    } loc(#loc73)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc91)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc92)
+    %tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc93)
+    %tmp7_19 = tt.addptr %tmp7, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc93)
+    %tmp7_20 = tt.load %tmp7_19 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>> loc(#loc94)
+    %tmp8 = arith.constant 0.693147182 : f32 loc(#loc95)
+    %tmp9 = arith.constant dense<0.693147182> : tensor<64x1xf32> loc(#loc96)
+    %tmp9_21 = arith.mulf %tmp7_20, %tmp9 : tensor<64x1xf32> loc(#loc96)
+    %tmp10 = arith.constant 1.44269502 : f32 loc(#loc97)
+    %tmp11 = arith.constant dense<1.44269502> : tensor<64x1xf32> loc(#loc98)
+    %tmp11_22 = arith.mulf %tmp9_21, %tmp11 : tensor<64x1xf32> loc(#loc98)
+    %tmp12 = arith.subf %tmp4_18, %tmp11_22 : tensor<64x1xf32> loc(#loc99)
+    %4 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc42)
+    %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc42)
+    tt.store %5, %tmp12 : tensor<64x1x!tt.ptr<f32>> loc(#loc43)
+    tt.return loc(#loc44)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc45))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc46)
+      tt.reduce.return %2 : f32 loc(#loc46)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc46)
+    tt.return %0 : tensor<64xf32> loc(#loc48)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc49)
+    tt.return %1 : tensor<64xf32> loc(#loc49)
+  } loc(#loc45)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc50)), %b: f32 loc("b"(#loc50))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc51)
+    tt.return %0 : f32 loc(#loc52)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc53)
+    tt.return %1 : f32 loc(#loc53)
+  } loc(#loc50)
+} loc(#loc)
+#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":19:13)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":20:15)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":23:28)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":23:33)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:36)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:44)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:23)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":25:46)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":26:27)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":26:37)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":28:19)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":29:19)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":31:43)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":32:40)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":33:31)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":34:29)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:45)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:41)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:55)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:50)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:34)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:60)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:114)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:45)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:41)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:34)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:50)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:104)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":40:22)
+#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":42:23)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":43:40)
+#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":43:8)
+#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":44:25)
+#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":44:28)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":45:30)
+#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":45:35)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":47:11)
+#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":48:18)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":49:12)
+#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":50:19)
+#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":51:19)
+#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:25)
+#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:37)
+#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:4)
+#loc46 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc48 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
+#loc49 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
+#loc51 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc52 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
+#loc53 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
+#loc60 = loc("xnumel"(#loc1))
+#loc61 = loc("r0_numel"(#loc2))
+#loc62 = loc("xoffset"(#loc3))
+#loc63 = loc("xoffset"(#loc4))
+#loc64 = loc("xindex"(#loc5))
+#loc65 = loc("xindex"(#loc6))
+#loc66 = loc("xindex"(#loc7))
+#loc67 = loc("xmask"(#loc8))
+#loc68 = loc("r0_base"(#loc9))
+#loc69 = loc("r0_base"(#loc10))
+#loc70 = loc("x0"(#loc11))
+#loc71 = loc("x1"(#loc12))
+#loc72 = loc("_tmp4"(#loc13))
+#loc73 = loc("_tmp4"(#loc14))
+#loc74 = loc("r0_index"(#loc15))
+#loc75 = loc("r0_mask"(#loc16))
+#loc76 = loc("tmp0"(#loc17))
+#loc77 = loc("tmp0"(#loc18))
+#loc78 = loc("tmp0"(#loc19))
+#loc79 = loc("tmp0"(#loc20))
+#loc80 = loc("tmp0"(#loc21))
+#loc81 = loc("tmp0"(#loc22))
+#loc82 = loc("tmp0"(#loc23))
+#loc83 = loc("tmp1"(#loc24))
+#loc84 = loc("tmp1"(#loc25))
+#loc85 = loc("tmp1"(#loc26))
+#loc86 = loc("tmp1"(#loc27))
+#loc87 = loc("tmp1"(#loc28))
+#loc88 = loc("tmp2"(#loc29))
+#loc89 = loc("tmp5"(#loc30))
+#loc90 = loc("_tmp4"(#loc31))
+#loc91 = loc("tmp4"(#loc33))
+#loc92 = loc("tmp4"(#loc34))
+#loc93 = loc("tmp7"(#loc35))
+#loc94 = loc("tmp7"(#loc36))
+#loc95 = loc("tmp8"(#loc37))
+#loc96 = loc("tmp9"(#loc38))
+#loc97 = loc("tmp10"(#loc39))
+#loc98 = loc("tmp11"(#loc40))
+#loc99 = loc("tmp12"(#loc41))

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ttgir ADDED Viewed

	@@ -0,0 +1,158 @@

+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":18:0)
+#loc1 = loc(unknown)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":44:25)
+#loc40 = loc("in_ptr0"(#loc))
+#loc41 = loc("in_ptr1"(#loc))
+#loc42 = loc("in_ptr2"(#loc))
+#loc43 = loc("out_ptr1"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc71 = loc("tmp4"(#loc29))
+#loc79 = loc(callsite(#loc1 at #loc71))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.693147182> : tensor<64x1xf32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<1.44269502> : tensor<64x1xf32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked1> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x4xi32, #blocked1> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked1> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_4 = arith.constant dense<4096> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<768> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc46)
+    %xoffset_7 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc47)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48)
+    %xindex_8 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48)
+    %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc48)
+    %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc48)
+    %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc49)
+    %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<64x1xi32, #blocked> loc(#loc49)
+    %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<64x1xi32, #blocked1> loc(#loc49)
+    %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc49)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc50)
+    %r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x4xi32, #blocked1> loc(#loc50)
+    %x0 = arith.remsi %xindex_13, %cst_6 : tensor<64x1xi32, #blocked1> loc(#loc51)
+    %x1 = arith.divsi %xindex_13, %cst_6 : tensor<64x1xi32, #blocked1> loc(#loc52)
+    %tmp0 = arith.muli %x1, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc53)
+    %tmp0_16 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc54)
+    %tmp0_17 = arith.muli %x0, %cst_4 : tensor<64x1xi32, #blocked1> loc(#loc55)
+    %tmp0_18 = tt.broadcast %tmp0_17 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc56)
+    %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc57)
+    %tmp1 = arith.muli %xindex_13, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc58)
+    %tmp1_20 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc59)
+    %tmp1_21 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc60)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_26 = %cst_3) -> (tensor<64x4xf32, #blocked1>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked1> loc(#loc62)
+      %r0_index_27 = arith.addi %r0_index, %r0_base_15 : tensor<1x4xi32, #blocked1> loc(#loc62)
+      %r0_mask = arith.cmpi slt, %r0_index_27, %cst_2 : tensor<1x4xi32, #blocked1> loc(#loc63)
+      %tmp0_28 = tt.broadcast %r0_index_27 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc54)
+      %tmp0_29 = arith.addi %tmp0_28, %tmp0_16 : tensor<64x4xi32, #blocked1> loc(#loc54)
+      %tmp0_30 = arith.addi %tmp0_29, %tmp0_18 : tensor<64x4xi32, #blocked1> loc(#loc56)
+      %tmp0_31 = tt.addptr %tmp0_19, %tmp0_30 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc57)
+      %tmp0_32 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked1> -> tensor<64x4xi1, #blocked1> loc(#loc64)
+      %tmp0_33 = tt.load %tmp0_31, %tmp0_32, %cst_1 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc64)
+      %tmp0_34 = arith.extf %tmp0_33 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc65)
+      %tmp1_35 = arith.addi %tmp0_28, %tmp1_20 : tensor<64x4xi32, #blocked1> loc(#loc59)
+      %tmp1_36 = tt.addptr %tmp1_21, %tmp1_35 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc60)
+      %tmp1_37 = tt.load %tmp1_36, %tmp0_32, %cst_1 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc66)
+      %tmp1_38 = arith.extf %tmp1_37 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc67)
+      %tmp2 = arith.mulf %tmp0_34, %tmp1_38 : tensor<64x4xf32, #blocked1> loc(#loc68)
+      %tmp5 = arith.addf %_tmp4_26, %tmp2 : tensor<64x4xf32, #blocked1> loc(#loc69)
+      %_tmp4_39 = arith.select %tmp0_32, %tmp5, %_tmp4_26 : tensor<64x4xi1, #blocked1>, tensor<64x4xf32, #blocked1> loc(#loc70)
+      scf.yield %_tmp4_39 : tensor<64x4xf32, #blocked1> loc(#loc27)
+    } loc(#loc61)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_26: f32 loc(callsite(#loc1 at #loc71)), %tmp4_27: f32 loc(callsite(#loc1 at #loc71))):
+      %tmp4_28 = arith.addf %tmp4_26, %tmp4_27 : f32 loc(#loc80)
+      tt.reduce.return %tmp4_28 : f32 loc(#loc78)
+    }) : (tensor<64x4xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc78)
+    %tmp12 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72)
+    %tmp4_22 = tt.expand_dims %tmp12 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xf32, #blocked> loc(#loc73)
+    %tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc74)
+    %tmp7_23 = tt.addptr %tmp7, %xindex_14 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc74)
+    %tmp7_24 = tt.load %tmp7_23 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc75)
+    %tmp9 = arith.mulf %tmp7_24, %cst : tensor<64x1xf32, #blocked> loc(#loc76)
+    %tmp11 = arith.mulf %tmp9, %cst_0 : tensor<64x1xf32, #blocked> loc(#loc77)
+    %tmp12_25 = arith.subf %tmp4_22, %tmp11 : tensor<64x1xf32, #blocked> loc(#loc72)
+    %0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc37)
+    %1 = tt.addptr %0, %xindex_14 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc37)
+    tt.store %1, %tmp12_25 : tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc38)
+    tt.return loc(#loc39)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":23:28)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":23:33)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:44)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:23)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":26:37)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":28:19)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":29:19)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:45)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:41)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:55)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:50)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:34)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:45)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:41)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:34)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":32:40)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":33:31)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":34:29)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:60)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:114)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:50)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:104)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":40:22)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":42:23)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":43:40)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":43:8)
+#loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":51:19)
+#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":44:28)
+#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":45:30)
+#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":45:35)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":48:18)
+#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":50:19)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:25)
+#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:37)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:4)
+#loc46 = loc("xoffset"(#loc2))
+#loc47 = loc("xoffset"(#loc3))
+#loc48 = loc("xindex"(#loc4))
+#loc49 = loc("xindex"(#loc5))
+#loc50 = loc("r0_base"(#loc6))
+#loc51 = loc("x0"(#loc7))
+#loc52 = loc("x1"(#loc8))
+#loc53 = loc("tmp0"(#loc9))
+#loc54 = loc("tmp0"(#loc10))
+#loc55 = loc("tmp0"(#loc11))
+#loc56 = loc("tmp0"(#loc12))
+#loc57 = loc("tmp0"(#loc13))
+#loc58 = loc("tmp1"(#loc14))
+#loc59 = loc("tmp1"(#loc15))
+#loc60 = loc("tmp1"(#loc16))
+#loc61 = loc("_tmp4"(#loc17))
+#loc62 = loc("r0_index"(#loc18))
+#loc63 = loc("r0_mask"(#loc19))
+#loc64 = loc("tmp0"(#loc20))
+#loc65 = loc("tmp0"(#loc21))
+#loc66 = loc("tmp1"(#loc22))
+#loc67 = loc("tmp1"(#loc23))
+#loc68 = loc("tmp2"(#loc24))
+#loc69 = loc("tmp5"(#loc25))
+#loc70 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp12"(#loc31))
+#loc73 = loc("tmp4"(#loc32))
+#loc74 = loc("tmp7"(#loc33))
+#loc75 = loc("tmp7"(#loc34))
+#loc76 = loc("tmp9"(#loc35))
+#loc77 = loc("tmp11"(#loc36))
+#loc78 = loc(callsite(#loc28 at #loc71))
+#loc80 = loc(callsite(#loc30 at #loc78))

progress/github/SpecForge/cache/compiled_kernels/triton/1/44JLG73FVDA6R64JBWSDKJM7E5NAOIY2BPIYSOMAO6TULJ7NOYLA/triton_red_fused_mul_0.ttir ADDED Viewed

	@@ -0,0 +1,155 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":18:0)
+#loc1 = loc(unknown)
+#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":44:25)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("in_ptr1"(#loc))
+#loc44 = loc("in_ptr2"(#loc))
+#loc45 = loc("out_ptr1"(#loc))
+#loc46 = loc("xnumel"(#loc))
+#loc47 = loc("r0_numel"(#loc))
+#loc77 = loc("tmp4"(#loc33))
+#loc83 = loc(callsite(#loc1 at #loc77))
+module {
+  tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc2)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %tmp11 = arith.constant dense<1.44269502> : tensor<64x1xf32> loc(#loc48)
+    %tmp9 = arith.constant dense<0.693147182> : tensor<64x1xf32> loc(#loc49)
+    %cst_0 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<768> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc50)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc51)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc52)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc53)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc54)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc54)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc55)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc56)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc57)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc58)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_15 = %cst_3) -> (tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc60)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc60)
+      %r0_mask = arith.cmpi slt, %r0_index_16, %cst_2 : tensor<1x4xi32> loc(#loc61)
+      %tmp0 = arith.muli %x1, %cst_1 : tensor<64x1xi32> loc(#loc62)
+      %tmp0_17 = tt.broadcast %r0_index_16 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc63)
+      %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc63)
+      %tmp0_19 = arith.addi %tmp0_17, %tmp0_18 : tensor<64x4xi32> loc(#loc63)
+      %tmp0_20 = arith.muli %x0, %cst_0 : tensor<64x1xi32> loc(#loc64)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc65)
+      %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<64x4xi32> loc(#loc65)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc66)
+      %tmp0_25 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc67)
+      %tmp0_26 = tt.load %tmp0_24, %tmp0_25, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_27 = arith.extf %tmp0_26 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc68)
+      %tmp1 = arith.muli %xindex_8, %cst_1 : tensor<64x1xi32> loc(#loc69)
+      %tmp1_28 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc70)
+      %tmp1_29 = arith.addi %tmp0_17, %tmp1_28 : tensor<64x4xi32> loc(#loc70)
+      %tmp1_30 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc71)
+      %tmp1_31 = tt.addptr %tmp1_30, %tmp1_29 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc71)
+      %tmp1_32 = tt.load %tmp1_31, %tmp0_25, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc72)
+      %tmp1_33 = arith.extf %tmp1_32 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc73)
+      %tmp2 = arith.mulf %tmp0_27, %tmp1_33 : tensor<64x4xf32> loc(#loc74)
+      %tmp5 = arith.addf %_tmp4_15, %tmp2 : tensor<64x4xf32> loc(#loc75)
+      %_tmp4_34 = arith.select %tmp0_25, %tmp5, %_tmp4_15 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc76)
+      scf.yield %_tmp4_34 : tensor<64x4xf32> loc(#loc31)
+    } loc(#loc59)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_15: f32 loc(callsite(#loc1 at #loc77)), %tmp4_16: f32 loc(callsite(#loc1 at #loc77))):
+      %tmp4_17 = arith.addf %tmp4_15, %tmp4_16 : f32 loc(#loc84)
+      tt.reduce.return %tmp4_17 : f32 loc(#loc82)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc82)
+    %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc78)
+    %tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc79)
+    %tmp7_11 = tt.addptr %tmp7, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc79)
+    %tmp7_12 = tt.load %tmp7_11 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>> loc(#loc80)
+    %tmp9_13 = arith.mulf %tmp7_12, %tmp9 : tensor<64x1xf32> loc(#loc49)
+    %tmp11_14 = arith.mulf %tmp9_13, %tmp11 : tensor<64x1xf32> loc(#loc48)
+    %tmp12 = arith.subf %tmp4_10, %tmp11_14 : tensor<64x1xf32> loc(#loc81)
+    %0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc39)
+    %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc39)
+    tt.store %1, %tmp12 : tensor<64x1x!tt.ptr<f32>> loc(#loc40)
+    tt.return loc(#loc41)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":32:40)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":50:19)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":48:18)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":23:28)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":23:33)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:36)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:44)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":24:23)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":26:27)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":26:37)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":28:19)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":29:19)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":33:31)
+#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":34:29)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:45)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:41)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:55)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:50)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:34)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:60)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":38:114)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:45)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:41)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:34)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:50)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":39:104)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":40:22)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":42:23)
+#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":43:40)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":43:8)
+#loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":44:28)
+#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":45:30)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":45:35)
+#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":51:19)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:25)
+#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:37)
+#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r7/cr7yqbjyejqfa2ld73pqqr2tp7g6ybrhs4u34xt5s4cusntoze4a.py":52:4)
+#loc48 = loc("tmp11"(#loc3))
+#loc49 = loc("tmp9"(#loc4))
+#loc50 = loc("xoffset"(#loc5))
+#loc51 = loc("xoffset"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xindex"(#loc8))
+#loc54 = loc("xindex"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("r0_base"(#loc11))
+#loc57 = loc("x0"(#loc12))
+#loc58 = loc("x1"(#loc13))
+#loc59 = loc("_tmp4"(#loc2))
+#loc60 = loc("r0_index"(#loc14))
+#loc61 = loc("r0_mask"(#loc15))
+#loc62 = loc("tmp0"(#loc16))
+#loc63 = loc("tmp0"(#loc17))
+#loc64 = loc("tmp0"(#loc18))
+#loc65 = loc("tmp0"(#loc19))
+#loc66 = loc("tmp0"(#loc20))
+#loc67 = loc("tmp0"(#loc21))
+#loc68 = loc("tmp0"(#loc22))
+#loc69 = loc("tmp1"(#loc23))
+#loc70 = loc("tmp1"(#loc24))
+#loc71 = loc("tmp1"(#loc25))
+#loc72 = loc("tmp1"(#loc26))
+#loc73 = loc("tmp1"(#loc27))
+#loc74 = loc("tmp2"(#loc28))
+#loc75 = loc("tmp5"(#loc29))
+#loc76 = loc("_tmp4"(#loc30))
+#loc78 = loc("tmp4"(#loc35))
+#loc79 = loc("tmp7"(#loc36))
+#loc80 = loc("tmp7"(#loc37))
+#loc81 = loc("tmp12"(#loc38))
+#loc82 = loc(callsite(#loc32 at #loc77))
+#loc84 = loc(callsite(#loc34 at #loc82))

progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json"}}

progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"hash": "ef65677dccb0fd1ca33e2efd85dd27b554735f6893116cae461084f5b56323fe", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 196608, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}

progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir ADDED Viewed

The diff for this file is too large to render. See raw diff

progress/github/SpecForge/cache/compiled_kernels/triton/1/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir ADDED Viewed

	@@ -0,0 +1,896 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":18:0)
+#loc1 = loc(unknown)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":200:41)
+#loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":568:16)
+#loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":449:51)
+#loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":462:34)
+#loc167 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":235:45)
+#loc193 = loc("arg_Q"(#loc))
+#loc194 = loc("arg_K"(#loc))
+#loc195 = loc("arg_V"(#loc))
+#loc196 = loc("arg_M"(#loc))
+#loc197 = loc("arg_L"(#loc))
+#loc198 = loc("arg_KV_NUM_BLKS"(#loc))
+#loc199 = loc("arg_KV_IDX"(#loc))
+#loc200 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
+#loc201 = loc("arg_FULL_KV_IDX"(#loc))
+#loc202 = loc("out_ptr0"(#loc))
+#loc203 = loc("ks0"(#loc))
+#loc204 = loc("ks1"(#loc))
+#loc255 = loc(callsite(#loc66 at #loc2))
+#loc296 = loc("m_ij"(#loc111))
+#loc306 = loc("l_i"(#loc123))
+#loc346 = loc(callsite(#loc66 at #loc167))
+#loc406 = loc(callsite(#loc296 at #loc255))
+#loc416 = loc(callsite(#loc306 at #loc255))
+#loc435 = loc(callsite(#loc296 at #loc346))
+#loc445 = loc(callsite(#loc306 at #loc346))
+#loc465 = loc(callsite(#loc1 at #loc406))
+#loc467 = loc(callsite(#loc1 at #loc416))
+#loc495 = loc(callsite(#loc1 at #loc435))
+#loc497 = loc(callsite(#loc1 at #loc445))
+module {
+  tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_M: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_M"(#loc)), %arg_L: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_L"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0> : tensor<1x128xi64> loc(#loc1)
+    %cst_0 = arith.constant dense<1024> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1)
+    %cst_2 = arith.constant dense<16> : tensor<1x64xi32> loc(#loc205)
+    %cst_3 = arith.constant dense<16> : tensor<512x1xi32> loc(#loc205)
+    %cst_4 = arith.constant dense<0xFF800000> : tensor<512xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<1.44269502> : tensor<512x64xf32> loc(#loc1)
+    %cst_6 = arith.constant dense<false> : tensor<512x64xi1> loc(#loc205)
+    %cst_7 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc205)
+    %cst_8 = arith.constant dense<1> : tensor<512x1xi32> loc(#loc205)
+    %cst_9 = arith.constant dense<0> : tensor<512x1xi32> loc(#loc205)
+    %cst_10 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc205)
+    %cst_11 = arith.constant dense<0xFF800000> : tensor<512x64xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<0.0883883461> : tensor<512x64xf32> loc(#loc1)
+    %cst_13 = arith.constant dense<0.000000e+00> : tensor<512x64xf32> loc(#loc1)
+    %acc = arith.constant dense<0.000000e+00> : tensor<512x128xf32> loc(#loc360)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1)
+    %c63_i32 = arith.constant 63 : i32 loc(#loc1)
+    %c31_i32 = arith.constant 31 : i32 loc(#loc1)
+    %cst_15 = arith.constant dense<128> : tensor<1x128x1xi32> loc(#loc1)
+    %mask = arith.constant dense<128> : tensor<1x1x128xi32> loc(#loc207)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %q_range = arith.constant dense<4096> : tensor<1x128x1xi32> loc(#loc208)
+    %cst_16 = arith.constant dense<128> : tensor<4x1x1xi32> loc(#loc1)
+    %true = arith.constant true loc(#loc7)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %HKV = arith.constant 8 : i32 loc(#loc209)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc9)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10)
+    %1 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc11)
+    %2 = arith.muli %ks0, %c32_i32 : i32 loc(#loc12)
+    %TILE_KV_OG = arith.addi %ks1, %c31_i32 : i32 loc(#loc361)
+    %TILE_KV_OG_17 = arith.divsi %TILE_KV_OG, %c32_i32 : i32 loc(#loc362)
+    %TILE_KV = arith.addi %TILE_KV_OG_17, %c63_i32 : i32 loc(#loc363)
+    %TILE_KV_18 = arith.divsi %TILE_KV, %c64_i32 : i32 loc(#loc364)
+    %TILE_KV_19 = arith.muli %TILE_KV_18, %c64_i32 : i32 loc(#loc212)
+    %3 = arith.divsi %TILE_KV_19, %c64_i32 : i32 loc(#loc18)
+    %off_z = tt.get_program_id x : i32 loc(#loc213)
+    %off_z_20 = arith.divsi %off_z, %HKV : i32 loc(#loc214)
+    %off_hkv = arith.remsi %off_z, %HKV : i32 loc(#loc215)
+    %off_t = tt.get_program_id y : i32 loc(#loc216)
+    %q_offset = arith.muli %off_z_20, %0 : i32 loc(#loc217)
+    %q_offset_21 = arith.muli %off_hkv, %c512_i32 : i32 loc(#loc218)
+    %q_offset_22 = arith.addi %q_offset, %q_offset_21 : i32 loc(#loc219)
+    %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc220)
+    %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc221)
+    %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc222)
+    tt.assert %true, "" : i1 loc(#loc7)
+    %off_g = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc223)
+    %off_m = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc224)
+    %offs_m = tt.expand_dims %off_m {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc225)
+    %offs_m_23 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc226)
+    %offs_m_24 = tt.reshape %offs_m_23 : tensor<4x128xi32> -> tensor<512xi32> loc(#loc365)
+    %block_n_start = arith.muli %off_t, %3 : i32 loc(#loc228)
+    %block_n_end = arith.addi %block_n_start, %3 : i32 loc(#loc229)
+    %q_range_25 = tt.expand_dims %off_g {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc230)
+    %q_range_26 = tt.expand_dims %q_range_25 {axis = 2 : i32} : tensor<4x1xi32> -> tensor<4x1x1xi32> loc(#loc230)
+    %q_range_27 = arith.muli %q_range_26, %cst_16 : tensor<4x1x1xi32> loc(#loc231)
+    %q_range_28 = tt.expand_dims %offs_m {axis = 2 : i32} : tensor<1x128xi32> -> tensor<1x128x1xi32> loc(#loc232)
+    %q_range_29 = arith.muli %q_range_28, %q_range : tensor<1x128x1xi32> loc(#loc208)
+    %q_range_30 = tt.broadcast %q_range_27 : tensor<4x1x1xi32> -> tensor<4x128x1xi32> loc(#loc233)
+    %q_range_31 = tt.broadcast %q_range_29 : tensor<1x128x1xi32> -> tensor<4x128x1xi32> loc(#loc233)
+    %q_range_32 = arith.addi %q_range_30, %q_range_31 : tensor<4x128x1xi32> loc(#loc233)
+    %q_range_33 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<1x128xi32> -> tensor<1x1x128xi32> loc(#loc234)
+    %q_range_34 = tt.broadcast %q_range_32 : tensor<4x128x1xi32> -> tensor<4x128x128xi32> loc(#loc235)
+    %q_range_35 = tt.broadcast %q_range_33 : tensor<1x1x128xi32> -> tensor<4x128x128xi32> loc(#loc235)
+    %q_range_36 = arith.addi %q_range_34, %q_range_35 : tensor<4x128x128xi32> loc(#loc235)
+    %q = tt.splat %ks0 : i32 -> tensor<1x128x1xi32> loc(#loc236)
+    %q_37 = arith.cmpi slt, %q_range_28, %q : tensor<1x128x1xi32> loc(#loc236)
+    %q_38 = tt.addptr %arg_Q, %q_offset_22 : !tt.ptr<bf16>, i32 loc(#loc237)
+    %q_39 = tt.splat %q_38 : !tt.ptr<bf16> -> tensor<4x128x128x!tt.ptr<bf16>> loc(#loc238)
+    %q_40 = tt.addptr %q_39, %q_range_36 : tensor<4x128x128x!tt.ptr<bf16>>, tensor<4x128x128xi32> loc(#loc238)
+    %q_41 = tt.broadcast %q_37 : tensor<1x128x1xi1> -> tensor<4x128x128xi1> loc(#loc239)
+    %q_42 = tt.load %q_40, %q_41 : tensor<4x128x128x!tt.ptr<bf16>> loc(#loc239)
+    %q_43 = tt.reshape %q_42 : tensor<4x128x128xbf16> -> tensor<512x128xbf16> loc(#loc240)
+    %kv_num_blocks = tt.load %arg_KV_NUM_BLKS : !tt.ptr<i32> loc(#loc241)
+    %off_n_block_in_sparse = arith.remsi %block_n_start, %c2_i32 : i32 loc(#loc242)
+    %off_n = tt.load %arg_KV_IDX : !tt.ptr<i32> loc(#loc243)
+    %off_n_44 = arith.muli %off_n, %c128_i32 : i32 loc(#loc244)
+    %off_n_45 = arith.muli %off_n_block_in_sparse, %c64_i32 : i32 loc(#loc245)
+    %off_n_46 = arith.addi %off_n_44, %off_n_45 : i32 loc(#loc246)
+    %block_n_last_valid = arith.muli %kv_num_blocks, %c2_i32 : i32 loc(#loc247)
+    %block_n_last_valid_47 = arith.addi %ks1, %c63_i32 : i32 loc(#loc366)
+    %block_n_last_valid_48 = arith.divsi %block_n_last_valid_47, %c64_i32 : i32 loc(#loc367)
+    %block_n_last_valid_49 = arith.maxsi %block_n_last_valid_48, %c1_i32 : i32 loc(#loc249)
+    %block_n_last_valid_50 = arith.minsi %block_n_last_valid, %block_n_last_valid_49 : i32 loc(#loc250)
+    %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc251)
+    %offs_n_51 = tt.splat %off_n_46 : i32 -> tensor<64xi32> loc(#loc252)
+    %offs_n_52 = arith.addi %offs_n, %offs_n_51 : tensor<64xi32> loc(#loc252)
+    %4 = tt.expand_dims %offs_m_24 {axis = 1 : i32} : tensor<512xi32> -> tensor<512x1xi32> loc(#loc60)
+    %5 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc61)
+    %6 = arith.cmpi sle, %block_n_end, %block_n_last_valid_50 : i32 loc(#loc62)
+    %7 = arith.select %6, %block_n_end, %block_n_last_valid_50 : i32 loc(#loc63)
+    %kv_offset:5 = scf.for %start_n = %block_n_start to %7 step %c1_i32 iter_args(%acc_78 = %acc, %l_i_79 = %cst_14, %m_i_80 = %cst_4, %offs_n_81 = %5, %kv_offset_82 = %c0_i32) -> (tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32)  : i32 {
+      %kv_base_offset = arith.addi %off_n_46, %kv_offset_82 : i32 loc(#loc369)
+      %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc370)
+      %offs_n_load_83 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc370)
+      %ptr = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc456)
+      %ptr_84 = arith.muli %ptr, %cst_0 : tensor<64x1xi32> loc(#loc457)
+      %ptr_85 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc458)
+      %ptr_86 = tt.addptr %ptr_85, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc458)
+      %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc459)
+      %ptr_88 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc459)
+      %ptr_89 = tt.addptr %ptr_87, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc459)
+      %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc460)
+      %k_90 = arith.cmpi slt, %ptr, %k : tensor<64x1xi32> loc(#loc460)
+      %k_91 = tt.broadcast %k_90 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc461)
+      %k_92 = tt.load %ptr_89, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc461)
+      %k_93 = tt.trans %k_92 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc372)
+      %qk = tt.dot %q_43, %k_93, %cst_13, inputPrecision = tf32 : tensor<512x128xbf16> * tensor<128x64xbf16> -> tensor<512x64xf32> loc(#loc373)
+      %qk_94 = arith.mulf %qk, %cst_12 : tensor<512x64xf32> loc(#loc374)
+      %m = tt.splat %ks0 : i32 -> tensor<512x1xi32> loc(#loc462)
+      %m_95 = arith.remsi %4, %m : tensor<512x1xi32> loc(#loc462)
+      %n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc463)
+      %n_96 = arith.remsi %offs_n_81, %n : tensor<1x64xi32> loc(#loc463)
+      %post_mod_scores = arith.cmpi slt, %offs_n_81, %n : tensor<1x64xi32> loc(#loc377)
+      %post_mod_scores_97 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc378)
+      %post_mod_scores_98 = arith.select %post_mod_scores_97, %qk_94, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc378)
+      %tmp3 = arith.cmpi slt, %m_95, %cst_9 : tensor<512x1xi32> loc(#loc379)
+      %tmp5 = tt.broadcast %n_96 : tensor<1x64xi32> -> tensor<512x64xi32> loc(#loc380)
+      %tmp5_99 = tt.broadcast %m_95 : tensor<512x1xi32> -> tensor<512x64xi32> loc(#loc380)
+      %tmp5_100 = arith.cmpi sle, %tmp5, %tmp5_99 : tensor<512x64xi32> loc(#loc380)
+      %tmp6 = tt.broadcast %tmp3 : tensor<512x1xi1> -> tensor<512x64xi1> loc(#loc381)
+      %tmp6_101 = arith.andi %tmp6, %tmp5_100 : tensor<512x64xi1> loc(#loc381)
+      %tmp7 = arith.cmpi sge, %m_95, %cst_9 : tensor<512x1xi32> loc(#loc382)
+      %tmp8 = arith.cmpi slt, %n_96, %cst_10 : tensor<1x64xi32> loc(#loc383)
+      %tmp9 = tt.broadcast %tmp7 : tensor<512x1xi1> -> tensor<512x64xi1> loc(#loc384)
+      %tmp9_102 = tt.broadcast %tmp8 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc384)
+      %tmp9_103 = arith.andi %tmp9, %tmp9_102 : tensor<512x64xi1> loc(#loc384)
+      %tmp10 = arith.extui %tmp8 : tensor<1x64xi1> to tensor<1x64xi32> loc(#loc385)
+      %tmp10_104 = arith.cmpi eq, %tmp10, %cst_10 : tensor<1x64xi32> loc(#loc385)
+      %tmp11 = tt.broadcast %tmp10_104 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc386)
+      %tmp11_105 = arith.andi %tmp9, %tmp11 : tensor<512x64xi1> loc(#loc386)
+      %tmp14 = arith.remsi %m_95, %cst_3 : tensor<512x1xi32> loc(#loc387)
+      %tmp14_106 = arith.cmpi ne, %tmp14, %cst_9 : tensor<512x1xi32> loc(#loc388)
+      %tmp14_107 = arith.divsi %m_95, %cst_3 : tensor<512x1xi32> loc(#loc389)
+      %tmp14_108 = arith.subi %tmp14_107, %cst_8 : tensor<512x1xi32> loc(#loc390)
+      %tmp14_109 = arith.select %tmp14_106, %tmp14_108, %tmp14_107 : tensor<512x1xi1>, tensor<512x1xi32> loc(#loc391)
+      %tmp14_110 = arith.select %tmp3, %tmp14_109, %tmp14_107 : tensor<512x1xi1>, tensor<512x1xi32> loc(#loc392)
+      %tmp16 = arith.remsi %n_96, %cst_2 : tensor<1x64xi32> loc(#loc393)
+      %tmp16_111 = arith.cmpi ne, %tmp16, %cst_10 : tensor<1x64xi32> loc(#loc394)
+      %tmp16_112 = arith.divsi %n_96, %cst_2 : tensor<1x64xi32> loc(#loc395)
+      %tmp16_113 = arith.subi %tmp16_112, %cst_7 : tensor<1x64xi32> loc(#loc396)
+      %tmp16_114 = arith.select %tmp16_111, %tmp16_113, %tmp16_112 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc397)
+      %tmp16_115 = arith.select %tmp8, %tmp16_114, %tmp16_112 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc398)
+      %tmp17 = tt.broadcast %tmp14_110 : tensor<512x1xi32> -> tensor<512x64xi32> loc(#loc399)
+      %tmp17_116 = tt.broadcast %tmp16_115 : tensor<1x64xi32> -> tensor<512x64xi32> loc(#loc399)
+      %tmp17_117 = arith.cmpi eq, %tmp17, %tmp17_116 : tensor<512x64xi32> loc(#loc399)
+      %tmp18 = arith.andi %tmp11_105, %tmp17_117 : tensor<512x64xi1> loc(#loc400)
+      %tmp19 = arith.ori %tmp9_103, %tmp18 : tensor<512x64xi1> loc(#loc401)
+      %tmp20 = arith.ori %tmp6_101, %tmp19 : tensor<512x64xi1> loc(#loc402)
+      %mask_mod_output = arith.select %post_mod_scores_97, %tmp20, %cst_6 : tensor<512x64xi1>, tensor<512x64xi1> loc(#loc403)
+      %post_mod_scores_118 = arith.select %mask_mod_output, %post_mod_scores_98, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc404)
+      %post_mod_scores_119 = arith.mulf %post_mod_scores_118, %cst_5 : tensor<512x64xf32> loc(#loc405)
+      %m_ij = "tt.reduce"(%post_mod_scores_119) <{axis = 1 : i32}> ({
+      ^bb0(%m_ij_152: f32 loc(callsite(#loc1 at #loc406)), %m_ij_153: f32 loc(callsite(#loc1 at #loc406))):
+        %m_ij_154 = arith.maxnumf %m_ij_152, %m_ij_153 : f32 loc(#loc519)
+        tt.reduce.return %m_ij_154 : f32 loc(#loc464)
+      }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc464)
+      %m_ij_120 = arith.maxnumf %m_i_80, %m_ij : tensor<512xf32> loc(#loc407)
+      %masked_out_rows = arith.cmpf oeq, %m_ij_120, %cst_4 : tensor<512xf32> loc(#loc408)
+      %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_120 : tensor<512xi1>, tensor<512xf32> loc(#loc409)
+      %alpha = arith.subf %m_i_80, %m_ij_masked : tensor<512xf32> loc(#loc410)
+      %alpha_121 = math.exp2 %alpha : tensor<512xf32> loc(#loc411)
+      %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc412)
+      %p_122 = tt.broadcast %p : tensor<512x1xf32> -> tensor<512x64xf32> loc(#loc413)
+      %p_123 = arith.subf %post_mod_scores_119, %p_122 : tensor<512x64xf32> loc(#loc413)
+      %p_124 = math.exp2 %p_123 : tensor<512x64xf32> loc(#loc414)
+      %l_i_125 = arith.mulf %l_i_79, %alpha_121 : tensor<512xf32> loc(#loc415)
+      %l_i_126 = "tt.reduce"(%p_124) <{axis = 1 : i32}> ({
+      ^bb0(%l_i_152: f32 loc(callsite(#loc1 at #loc416)), %l_i_153: f32 loc(callsite(#loc1 at #loc416))):
+        %l_i_154 = arith.addf %l_i_152, %l_i_153 : f32 loc(#loc520)
+        tt.reduce.return %l_i_154 : f32 loc(#loc466)
+      }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc466)
+      %l_i_127 = arith.addf %l_i_125, %l_i_126 : tensor<512xf32> loc(#loc417)
+      %acc_128 = tt.expand_dims %alpha_121 {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc418)
+      %acc_129 = tt.broadcast %acc_128 : tensor<512x1xf32> -> tensor<512x128xf32> loc(#loc419)
+      %acc_130 = arith.mulf %acc_78, %acc_129 : tensor<512x128xf32> loc(#loc419)
+      %ptr_131 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc468)
+      %ptr_132 = tt.addptr %ptr_131, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc468)
+      %ptr_133 = tt.broadcast %ptr_132 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc469)
+      %ptr_134 = tt.addptr %ptr_133, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc469)
+      %v = tt.load %ptr_134, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc470)
+      %acc_135 = arith.truncf %p_124 : tensor<512x64xf32> to tensor<512x64xbf16> loc(#loc421)
+      %acc_136 = tt.dot %acc_135, %v, %acc_130, inputPrecision = tf32 : tensor<512x64xbf16> * tensor<64x128xbf16> -> tensor<512x128xf32> loc(#loc422)
+      %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc471)
+      %cur_block = tt.addptr %arg_KV_IDX, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc472)
+      %cur_block_137 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc473)
+      %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc474)
+      %next_block_138 = arith.cmpi slt, %next_block, %kv_num_blocks : i32 loc(#loc475)
+      %next_block_139 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc476)
+      %next_block_140 = tt.load %next_block_139, %next_block_138 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc477)
+      %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc478)
+      %needs_jump_141 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc479)
+      %needs_jump_142 = arith.cmpi eq, %needs_jump_141, %c0_i32 : i32 loc(#loc480)
+      %jump_to_block = arith.subi %next_block_140, %cur_block_137 : i32 loc(#loc481)
+      %jump_to_block_143 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc482)
+      %jump_to_block_144 = arith.subi %jump_to_block_143, %c64_i32 : i32 loc(#loc483)
+      %offset = arith.extui %needs_jump_142 : i1 to i32 loc(#loc484)
+      %offset_145 = arith.muli %jump_to_block_144, %offset : i32 loc(#loc484)
+      %offset_146 = arith.subi %c1_i32, %offset : i32 loc(#loc485)
+      %offset_147 = arith.muli %offset_146, %c64_i32 : i32 loc(#loc486)
+      %offset_148 = arith.addi %offset_145, %offset_147 : i32 loc(#loc487)
+      %offs_n_149 = tt.splat %offset_148 : i32 -> tensor<1x64xi32> loc(#loc424)
+      %offs_n_150 = arith.addi %offs_n_81, %offs_n_149 : tensor<1x64xi32> loc(#loc424)
+      %kv_offset_151 = arith.addi %kv_offset_82, %offset_148 : i32 loc(#loc425)
+      scf.yield %acc_136, %l_i_127, %m_ij_120, %offs_n_150, %kv_offset_151 : tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32 loc(#loc333)
+    } loc(#loc524)
+    %kv_num_blocks_53 = tt.load %arg_FULL_KV_NUM_BLKS : !tt.ptr<i32> loc(#loc334)
+    %block_n_start_54 = arith.subi %c31_i32, %off_t : i32 loc(#loc335)
+    %block_n_start_55 = arith.muli %block_n_start_54, %3 : i32 loc(#loc336)
+    %block_n_end_56 = arith.addi %block_n_start_55, %3 : i32 loc(#loc337)
+    %off_n_block_in_sparse_57 = arith.remsi %block_n_start_55, %c2_i32 : i32 loc(#loc338)
+    %off_n_58 = tt.load %arg_FULL_KV_IDX : !tt.ptr<i32> loc(#loc339)
+    %off_n_59 = arith.muli %off_n_58, %c128_i32 : i32 loc(#loc340)
+    %off_n_60 = arith.muli %off_n_block_in_sparse_57, %c64_i32 : i32 loc(#loc341)
+    %off_n_61 = arith.addi %off_n_59, %off_n_60 : i32 loc(#loc342)
+    %block_n_last_valid_62 = arith.muli %kv_num_blocks_53, %c2_i32 : i32 loc(#loc343)
+    %block_n_last_valid_63 = arith.minsi %block_n_last_valid_62, %block_n_last_valid_49 : i32 loc(#loc344)
+    %offs_n_64 = tt.splat %off_n_61 : i32 -> tensor<64xi32> loc(#loc345)
+    %offs_n_65 = arith.addi %offs_n, %offs_n_64 : tensor<64xi32> loc(#loc345)
+    %8 = tt.expand_dims %offs_n_65 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164)
+    %9 = arith.cmpi sle, %block_n_end_56, %block_n_last_valid_63 : i32 loc(#loc165)
+    %10 = arith.select %9, %block_n_end_56, %block_n_last_valid_63 : i32 loc(#loc166)
+    %kv_offset_66:5 = scf.for %start_n = %block_n_start_55 to %10 step %c1_i32 iter_args(%acc_78 = %kv_offset#0, %l_i_79 = %kv_offset#1, %m_i_80 = %kv_offset#2, %offs_n_81 = %8, %kv_offset_82 = %c0_i32) -> (tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32)  : i32 {
+      %kv_base_offset = arith.addi %off_n_61, %kv_offset_82 : i32 loc(#loc426)
+      %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc427)
+      %offs_n_load_83 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc427)
+      %ptr = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc488)
+      %ptr_84 = arith.muli %ptr, %cst_0 : tensor<64x1xi32> loc(#loc489)
+      %ptr_85 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc490)
+      %ptr_86 = tt.addptr %ptr_85, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc490)
+      %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc491)
+      %ptr_88 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc491)
+      %ptr_89 = tt.addptr %ptr_87, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc491)
+      %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc492)
+      %k_90 = arith.cmpi slt, %ptr, %k : tensor<64x1xi32> loc(#loc492)
+      %k_91 = tt.broadcast %k_90 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc493)
+      %k_92 = tt.load %ptr_89, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc493)
+      %k_93 = tt.trans %k_92 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc429)
+      %qk = tt.dot %q_43, %k_93, %cst_13, inputPrecision = tf32 : tensor<512x128xbf16> * tensor<128x64xbf16> -> tensor<512x64xf32> loc(#loc430)
+      %qk_94 = arith.mulf %qk, %cst_12 : tensor<512x64xf32> loc(#loc431)
+      %post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc432)
+      %post_mod_scores_95 = arith.cmpi slt, %offs_n_81, %post_mod_scores : tensor<1x64xi32> loc(#loc432)
+      %post_mod_scores_96 = tt.broadcast %post_mod_scores_95 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc433)
+      %post_mod_scores_97 = arith.select %post_mod_scores_96, %qk_94, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc433)
+      %post_mod_scores_98 = arith.mulf %post_mod_scores_97, %cst_5 : tensor<512x64xf32> loc(#loc434)
+      %m_ij = "tt.reduce"(%post_mod_scores_98) <{axis = 1 : i32}> ({
+      ^bb0(%m_ij_131: f32 loc(callsite(#loc1 at #loc435)), %m_ij_132: f32 loc(callsite(#loc1 at #loc435))):
+        %m_ij_133 = arith.maxnumf %m_ij_131, %m_ij_132 : f32 loc(#loc521)
+        tt.reduce.return %m_ij_133 : f32 loc(#loc494)
+      }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc494)
+      %m_ij_99 = arith.maxnumf %m_i_80, %m_ij : tensor<512xf32> loc(#loc436)
+      %masked_out_rows = arith.cmpf oeq, %m_ij_99, %cst_4 : tensor<512xf32> loc(#loc437)
+      %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_99 : tensor<512xi1>, tensor<512xf32> loc(#loc438)
+      %alpha = arith.subf %m_i_80, %m_ij_masked : tensor<512xf32> loc(#loc439)
+      %alpha_100 = math.exp2 %alpha : tensor<512xf32> loc(#loc440)
+      %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc441)
+      %p_101 = tt.broadcast %p : tensor<512x1xf32> -> tensor<512x64xf32> loc(#loc442)
+      %p_102 = arith.subf %post_mod_scores_98, %p_101 : tensor<512x64xf32> loc(#loc442)
+      %p_103 = math.exp2 %p_102 : tensor<512x64xf32> loc(#loc443)
+      %l_i_104 = arith.mulf %l_i_79, %alpha_100 : tensor<512xf32> loc(#loc444)
+      %l_i_105 = "tt.reduce"(%p_103) <{axis = 1 : i32}> ({
+      ^bb0(%l_i_131: f32 loc(callsite(#loc1 at #loc445)), %l_i_132: f32 loc(callsite(#loc1 at #loc445))):
+        %l_i_133 = arith.addf %l_i_131, %l_i_132 : f32 loc(#loc522)
+        tt.reduce.return %l_i_133 : f32 loc(#loc496)
+      }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc496)
+      %l_i_106 = arith.addf %l_i_104, %l_i_105 : tensor<512xf32> loc(#loc446)
+      %acc_107 = tt.expand_dims %alpha_100 {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc447)
+      %acc_108 = tt.broadcast %acc_107 : tensor<512x1xf32> -> tensor<512x128xf32> loc(#loc448)
+      %acc_109 = arith.mulf %acc_78, %acc_108 : tensor<512x128xf32> loc(#loc448)
+      %ptr_110 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc498)
+      %ptr_111 = tt.addptr %ptr_110, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc498)
+      %ptr_112 = tt.broadcast %ptr_111 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc499)
+      %ptr_113 = tt.addptr %ptr_112, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc499)
+      %v = tt.load %ptr_113, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc500)
+      %acc_114 = arith.truncf %p_103 : tensor<512x64xf32> to tensor<512x64xbf16> loc(#loc450)
+      %acc_115 = tt.dot %acc_114, %v, %acc_109, inputPrecision = tf32 : tensor<512x64xbf16> * tensor<64x128xbf16> -> tensor<512x128xf32> loc(#loc451)
+      %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc501)
+      %cur_block = tt.addptr %arg_FULL_KV_IDX, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc502)
+      %cur_block_116 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc503)
+      %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc504)
+      %next_block_117 = arith.cmpi slt, %next_block, %kv_num_blocks_53 : i32 loc(#loc505)
+      %next_block_118 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc506)
+      %next_block_119 = tt.load %next_block_118, %next_block_117 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc507)
+      %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc508)
+      %needs_jump_120 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc509)
+      %needs_jump_121 = arith.cmpi eq, %needs_jump_120, %c0_i32 : i32 loc(#loc510)
+      %jump_to_block = arith.subi %next_block_119, %cur_block_116 : i32 loc(#loc511)
+      %jump_to_block_122 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc512)
+      %jump_to_block_123 = arith.subi %jump_to_block_122, %c64_i32 : i32 loc(#loc513)
+      %offset = arith.extui %needs_jump_121 : i1 to i32 loc(#loc514)
+      %offset_124 = arith.muli %jump_to_block_123, %offset : i32 loc(#loc514)
+      %offset_125 = arith.subi %c1_i32, %offset : i32 loc(#loc515)
+      %offset_126 = arith.muli %offset_125, %c64_i32 : i32 loc(#loc516)
+      %offset_127 = arith.addi %offset_124, %offset_126 : i32 loc(#loc517)
+      %offs_n_128 = tt.splat %offset_127 : i32 -> tensor<1x64xi32> loc(#loc453)
+      %offs_n_129 = arith.addi %offs_n_81, %offs_n_128 : tensor<1x64xi32> loc(#loc453)
+      %kv_offset_130 = arith.addi %kv_offset_82, %offset_127 : i32 loc(#loc454)
+      scf.yield %acc_115, %l_i_106, %m_ij_99, %offs_n_129, %kv_offset_130 : tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32 loc(#loc347)
+    } loc(#loc525)
+    %m_offset = arith.muli %off_t, %2 : i32 loc(#loc348)
+    %m_offset_67 = arith.muli %off_z_20, %1 : i32 loc(#loc349)
+    %m_offset_68 = arith.addi %m_offset, %m_offset_67 : i32 loc(#loc350)
+    %M_block_ptr = tt.addptr %arg_M, %m_offset_68 : !tt.ptr<f32>, i32 loc(#loc351)
+    %M_block_ptr_69 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc352)
+    %M_block_ptr_70 = arith.extsi %ks0 : i32 to i64 loc(#loc353)
+    %M_block_ptr_71 = arith.extsi %M_block_ptr_69 : i32 to i64 loc(#loc353)
+    %L_block_ptr = tt.addptr %arg_L, %m_offset_68 : !tt.ptr<f32>, i32 loc(#loc354)
+    %m_i = tt.reshape %kv_offset_66#2 : tensor<512xf32> -> tensor<4x128xf32> loc(#loc355)
+    %l_i = tt.reshape %kv_offset_66#1 : tensor<512xf32> -> tensor<4x128xf32> loc(#loc356)
+    %11 = tt.splat %M_block_ptr : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc177)
+    %12 = tt.splat %M_block_ptr_71 : i64 -> tensor<4xi64> loc(#loc177)
+    %13 = arith.extsi %off_g : tensor<4xi32> to tensor<4xi64> loc(#loc177)
+    %14 = arith.addi %12, %13 : tensor<4xi64> loc(#loc177)
+    %15 = tt.expand_dims %14 {axis = 1 : i32} : tensor<4xi64> -> tensor<4x1xi64> loc(#loc177)
+    %16 = tt.splat %M_block_ptr_70 : i64 -> tensor<4x1xi64> loc(#loc177)
+    %17 = arith.muli %15, %16 : tensor<4x1xi64> loc(#loc177)
+    %18 = tt.broadcast %17 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc177)
+    %19 = arith.extsi %off_m : tensor<128xi32> to tensor<128xi64> loc(#loc177)
+    %20 = tt.expand_dims %19 {axis = 0 : i32} : tensor<128xi64> -> tensor<1x128xi64> loc(#loc177)
+    %21 = tt.broadcast %20 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc177)
+    %22 = arith.addi %18, %21 : tensor<4x128xi64> loc(#loc177)
+    %23 = tt.addptr %11, %22 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi64> loc(#loc177)
+    %24 = arith.cmpi sge, %20, %cst : tensor<1x128xi64> loc(#loc177)
+    %25 = tt.splat %M_block_ptr_70 : i64 -> tensor<1x128xi64> loc(#loc177)
+    %26 = arith.cmpi slt, %20, %25 : tensor<1x128xi64> loc(#loc177)
+    %27 = arith.andi %24, %26 : tensor<1x128xi1> loc(#loc177)
+    %28 = tt.broadcast %27 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc177)
+    tt.store %23, %m_i, %28 : tensor<4x128x!tt.ptr<f32>> loc(#loc177)
+    %29 = tt.splat %L_block_ptr : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc178)
+    %30 = tt.addptr %29, %22 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi64> loc(#loc178)
+    tt.store %30, %l_i, %28 : tensor<4x128x!tt.ptr<f32>> loc(#loc178)
+    %idx_hq = tt.splat %M_block_ptr_69 : i32 -> tensor<4x1x1xi32> loc(#loc357)
+    %idx_hq_72 = arith.addi %idx_hq, %q_range_26 : tensor<4x1x1xi32> loc(#loc357)
+    %mask_73 = arith.cmpi slt, %q_range_33, %mask : tensor<1x1x128xi32> loc(#loc207)
+    %mask_74 = tt.broadcast %q_37 : tensor<1x128x1xi1> -> tensor<1x128x128xi1> loc(#loc358)
+    %mask_75 = tt.broadcast %mask_73 : tensor<1x1x128xi1> -> tensor<1x128x128xi1> loc(#loc358)
+    %mask_76 = arith.andi %mask_74, %mask_75 : tensor<1x128x128xi1> loc(#loc358)
+    %acc_77 = tt.reshape %kv_offset_66#0 : tensor<512x128xf32> -> tensor<4x128x128xf32> loc(#loc359)
+    %31 = arith.muli %q_range_28, %cst_15 : tensor<1x128x1xi32> loc(#loc182)
+    %32 = tt.broadcast %q_range_33 : tensor<1x1x128xi32> -> tensor<1x128x128xi32> loc(#loc183)
+    %33 = tt.broadcast %31 : tensor<1x128x1xi32> -> tensor<1x128x128xi32> loc(#loc183)
+    %34 = arith.addi %32, %33 : tensor<1x128x128xi32> loc(#loc183)
+    %35 = arith.muli %idx_hq_72, %cst_16 : tensor<4x1x1xi32> loc(#loc184)
+    %36 = tt.splat %ks0 : i32 -> tensor<4x1x1xi32> loc(#loc185)
+    %37 = arith.muli %35, %36 : tensor<4x1x1xi32> loc(#loc185)
+    %38 = tt.broadcast %34 : tensor<1x128x128xi32> -> tensor<4x128x128xi32> loc(#loc186)
+    %39 = tt.broadcast %37 : tensor<4x1x1xi32> -> tensor<4x128x128xi32> loc(#loc186)
+    %40 = arith.addi %38, %39 : tensor<4x128x128xi32> loc(#loc186)
+    %41 = arith.muli %off_t, %c4096_i32 : i32 loc(#loc187)
+    %42 = arith.muli %41, %ks0 : i32 loc(#loc188)
+    %43 = tt.splat %42 : i32 -> tensor<4x128x128xi32> loc(#loc189)
+    %44 = arith.addi %40, %43 : tensor<4x128x128xi32> loc(#loc189)
+    %45 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x128x128x!tt.ptr<f32>> loc(#loc190)
+    %46 = tt.addptr %45, %44 : tensor<4x128x128x!tt.ptr<f32>>, tensor<4x128x128xi32> loc(#loc190)
+    %47 = tt.broadcast %mask_76 : tensor<1x128x128xi1> -> tensor<4x128x128xi1> loc(#loc191)
+    tt.store %46, %acc_77, %47 : tensor<4x128x128x!tt.ptr<f32>> loc(#loc191)
+    tt.return loc(#loc192)
+  } loc(#loc)
+} loc(#loc)
+#loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":132:19)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":276:38)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":155:61)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":135:21)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":95:10)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":86:60)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":86:65)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":89:54)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":89:62)
+#loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
+#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":104:33)
+#loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
+#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":105:34)
+#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":105:45)
+#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":106:49)
+#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":108:26)
+#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":108:48)
+#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":110:49)
+#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":111:26)
+#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":113:23)
+#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":113:45)
+#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":113:35)
+#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":114:47)
+#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":117:12)
+#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":118:12)
+#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":137:25)
+#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":140:25)
+#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":141:44)
+#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":141:54)
+#loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":74:27)
+#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":141:22)
+#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":152:28)
+#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":153:34)
+#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":155:32)
+#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":155:26)
+#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":155:67)
+#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":155:49)
+#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":155:103)
+#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":155:84)
+#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":162:72)
+#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":162:24)
+#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":162:35)
+#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":162:20)
+#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":166:22)
+#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":173:28)
+#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":176:44)
+#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":177:20)
+#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":177:48)
+#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":177:95)
+#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":177:71)
+#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":181:52)
+#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":181:99)
+#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":181:109)
+#loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":181:72)
+#loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":183:26)
+#loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":183:37)
+#loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":194:40)
+#loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":194:57)
+#loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":198:53)
+#loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":198:38)
+#loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":532:40)
+#loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":388:32)
+#loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":392:35)
+#loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":328:27)
+#loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":393:107)
+#loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":328:38)
+#loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":328:20)
+#loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":328:49)
+#loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":336:52)
+#loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":336:23)
+#loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":395:17)
+#loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":397:19)
+#loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":399:14)
+#loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":301:21)
+#loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":404:36)
+#loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":405:36)
+#loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":413:44)
+#loc82 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":413:69)
+#loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":418:22)
+#loc84 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":420:23)
+#loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":421:22)
+#loc86 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":422:23)
+#loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":423:22)
+#loc88 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":424:22)
+#loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":425:24)
+#loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":426:23)
+#loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":429:70)
+#loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":429:79)
+#loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":429:91)
+#loc94 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":429:99)
+#loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":429:102)
+#loc96 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":429:119)
+#loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":431:70)
+#loc98 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":431:79)
+#loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":431:91)
+#loc100 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":431:99)
+#loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":431:102)
+#loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":431:119)
+#loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":432:25)
+#loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":433:24)
+#loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":434:23)
+#loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":435:23)
+#loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":440:73)
+#loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":442:69)
+#loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":445:27)
+#loc110 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
+#loc112 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
+#loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":449:27)
+#loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":451:35)
+#loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":452:51)
+#loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":456:31)
+#loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":456:25)
+#loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":457:51)
+#loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":457:39)
+#loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":457:21)
+#loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":462:16)
+#loc122 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
+#loc124 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
+#loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":462:24)
+#loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":464:22)
+#loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":464:16)
+#loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":467:107)
+#loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":468:22)
+#loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":468:44)
+#loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":291:33)
+#loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":575:63)
+#loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":292:38)
+#loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":292:24)
+#loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":293:109)
+#loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":293:113)
+#loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":293:55)
+#loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":293:25)
+#loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":294:30)
+#loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":294:35)
+#loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":294:60)
+#loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":295:34)
+#loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":295:48)
+#loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":295:63)
+#loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":296:29)
+#loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":296:47)
+#loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":296:61)
+#loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":296:42)
+#loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":578:26)
+#loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":579:21)
+#loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":579:8)
+#loc152 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":210:32)
+#loc153 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":212:44)
+#loc154 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":212:49)
+#loc155 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":213:38)
+#loc156 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":215:48)
+#loc157 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":216:24)
+#loc158 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":216:52)
+#loc159 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":216:99)
+#loc160 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":216:75)
+#loc161 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":219:56)
+#loc162 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":219:76)
+#loc163 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":221:41)
+#loc164 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":229:61)
+#loc165 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":233:57)
+#loc166 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":233:42)
+#loc168 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":239:23)
+#loc169 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":239:43)
+#loc170 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":239:35)
+#loc171 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":243:17)
+#loc172 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":246:25)
+#loc173 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":248:8)
+#loc174 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":251:17)
+#loc175 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":260:25)
+#loc176 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":261:25)
+#loc177 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":266:30)
+#loc178 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":267:30)
+#loc179 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":272:25)
+#loc180 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":276:30)
+#loc181 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":277:41)
+#loc182 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:53)
+#loc183 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:49)
+#loc184 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:65)
+#loc185 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:72)
+#loc186 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:61)
+#loc187 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:83)
+#loc188 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:89)
+#loc189 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:78)
+#loc190 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:25)
+#loc191 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:112)
+#loc192 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/7w/c7wuvfpczzsfkwsy5ra2l5uzjhob3kfxg67qg3lxntlztgutuwnk.py":279:4)
+#loc205 = loc(callsite(#loc1 at #loc2))
+#loc206 = loc("acc"(#loc4))
+#loc207 = loc("mask"(#loc5))
+#loc208 = loc("q_range"(#loc6))
+#loc209 = loc("HKV"(#loc8))
+#loc210 = loc("TILE_KV_OG"(#loc14))
+#loc211 = loc("TILE_KV"(#loc16))
+#loc212 = loc("TILE_KV"(#loc17))
+#loc213 = loc("off_z"(#loc19))
+#loc214 = loc("off_z"(#loc20))
+#loc215 = loc("off_hkv"(#loc21))
+#loc216 = loc("off_t"(#loc22))
+#loc217 = loc("q_offset"(#loc23))
+#loc218 = loc("q_offset"(#loc24))
+#loc219 = loc("q_offset"(#loc25))
+#loc220 = loc("k_offset"(#loc26))
+#loc221 = loc("K"(#loc27))
+#loc222 = loc("V"(#loc28))
+#loc223 = loc("off_g"(#loc29))
+#loc224 = loc("off_m"(#loc30))
+#loc225 = loc("offs_m"(#loc31))
+#loc226 = loc("offs_m"(#loc32))
+#loc227 = loc("offs_m"(#loc34))
+#loc228 = loc("block_n_start"(#loc35))
+#loc229 = loc("block_n_end"(#loc36))
+#loc230 = loc("q_range"(#loc37))
+#loc231 = loc("q_range"(#loc38))
+#loc232 = loc("q_range"(#loc39))
+#loc233 = loc("q_range"(#loc40))
+#loc234 = loc("q_range"(#loc41))
+#loc235 = loc("q_range"(#loc42))
+#loc236 = loc("q"(#loc43))
+#loc237 = loc("q"(#loc44))
+#loc238 = loc("q"(#loc45))
+#loc239 = loc("q"(#loc46))
+#loc240 = loc("q"(#loc47))
+#loc241 = loc("kv_num_blocks"(#loc48))
+#loc242 = loc("off_n_block_in_sparse"(#loc49))
+#loc243 = loc("off_n"(#loc50))
+#loc244 = loc("off_n"(#loc51))
+#loc245 = loc("off_n"(#loc52))
+#loc246 = loc("off_n"(#loc53))
+#loc247 = loc("block_n_last_valid"(#loc54))
+#loc248 = loc("block_n_last_valid"(#loc55))
+#loc249 = loc("block_n_last_valid"(#loc56))
+#loc250 = loc("block_n_last_valid"(#loc57))
+#loc251 = loc("offs_n"(#loc58))
+#loc252 = loc("offs_n"(#loc59))
+#loc253 = loc("acc"(#loc64))
+#loc254 = loc("kv_base_offset"(#loc65))
+#loc256 = loc("offs_n_load"(#loc67))
+#loc257 = loc("ptr"(#loc68))
+#loc258 = loc("k"(#loc69))
+#loc259 = loc("ptr"(#loc70))
+#loc260 = loc("ptr"(#loc71))
+#loc261 = loc("ptr"(#loc72))
+#loc262 = loc("k"(#loc75))
+#loc263 = loc("qk"(#loc76))
+#loc264 = loc("qk"(#loc77))
+#loc265 = loc("m"(#loc79))
+#loc266 = loc("n"(#loc80))
+#loc267 = loc("post_mod_scores"(#loc81))
+#loc268 = loc("post_mod_scores"(#loc82))
+#loc269 = loc("tmp3"(#loc83))
+#loc270 = loc("tmp5"(#loc84))
+#loc271 = loc("tmp6"(#loc85))
+#loc272 = loc("tmp7"(#loc86))
+#loc273 = loc("tmp8"(#loc87))
+#loc274 = loc("tmp9"(#loc88))
+#loc275 = loc("tmp10"(#loc89))
+#loc276 = loc("tmp11"(#loc90))
+#loc277 = loc("tmp14"(#loc91))
+#loc278 = loc("tmp14"(#loc92))
+#loc279 = loc("tmp14"(#loc93))
+#loc280 = loc("tmp14"(#loc94))
+#loc281 = loc("tmp14"(#loc95))
+#loc282 = loc("tmp14"(#loc96))
+#loc283 = loc("tmp16"(#loc97))
+#loc284 = loc("tmp16"(#loc98))
+#loc285 = loc("tmp16"(#loc99))
+#loc286 = loc("tmp16"(#loc100))
+#loc287 = loc("tmp16"(#loc101))
+#loc288 = loc("tmp16"(#loc102))
+#loc289 = loc("tmp17"(#loc103))
+#loc290 = loc("tmp18"(#loc104))
+#loc291 = loc("tmp19"(#loc105))
+#loc292 = loc("tmp20"(#loc106))
+#loc293 = loc("mask_mod_output"(#loc107))
+#loc294 = loc("post_mod_scores"(#loc108))
+#loc295 = loc("post_mod_scores"(#loc109))
+#loc297 = loc("m_ij"(#loc113))
+#loc298 = loc("masked_out_rows"(#loc114))
+#loc299 = loc("m_ij_masked"(#loc115))
+#loc300 = loc("alpha"(#loc116))
+#loc301 = loc("alpha"(#loc117))
+#loc302 = loc("p"(#loc118))
+#loc303 = loc("p"(#loc119))
+#loc304 = loc("p"(#loc120))
+#loc305 = loc("l_i"(#loc121))
+#loc307 = loc("l_i"(#loc125))
+#loc308 = loc("acc"(#loc126))
+#loc309 = loc("acc"(#loc127))
+#loc310 = loc("v"(#loc128))
+#loc311 = loc("acc"(#loc129))
+#loc312 = loc("acc"(#loc130))
+#loc313 = loc("cur_block_idx"(#loc131))
+#loc314 = loc("offset"(#loc132))
+#loc315 = loc("cur_block"(#loc133))
+#loc316 = loc("cur_block"(#loc134))
+#loc317 = loc("next_block"(#loc135))
+#loc318 = loc("next_block"(#loc136))
+#loc319 = loc("next_block"(#loc137))
+#loc320 = loc("next_block"(#loc138))
+#loc321 = loc("needs_jump"(#loc139))
+#loc322 = loc("needs_jump"(#loc140))
+#loc323 = loc("needs_jump"(#loc141))
+#loc324 = loc("jump_to_block"(#loc142))
+#loc325 = loc("jump_to_block"(#loc143))
+#loc326 = loc("jump_to_block"(#loc144))
+#loc327 = loc("offset"(#loc145))
+#loc328 = loc("offset"(#loc146))
+#loc329 = loc("offset"(#loc147))
+#loc330 = loc("offset"(#loc148))
+#loc331 = loc("offs_n"(#loc149))
+#loc332 = loc("kv_offset"(#loc150))
+#loc333 = loc(callsite(#loc151 at #loc2))
+#loc334 = loc("kv_num_blocks"(#loc152))
+#loc335 = loc("block_n_start"(#loc153))
+#loc336 = loc("block_n_start"(#loc154))
+#loc337 = loc("block_n_end"(#loc155))
+#loc338 = loc("off_n_block_in_sparse"(#loc156))
+#loc339 = loc("off_n"(#loc157))
+#loc340 = loc("off_n"(#loc158))
+#loc341 = loc("off_n"(#loc159))
+#loc342 = loc("off_n"(#loc160))
+#loc343 = loc("block_n_last_valid"(#loc161))
+#loc344 = loc("block_n_last_valid"(#loc162))
+#loc345 = loc("offs_n"(#loc163))
+#loc347 = loc(callsite(#loc151 at #loc167))
+#loc348 = loc("m_offset"(#loc168))
+#loc349 = loc("m_offset"(#loc169))
+#loc350 = loc("m_offset"(#loc170))
+#loc351 = loc("M_block_ptr"(#loc171))
+#loc352 = loc("M_block_ptr"(#loc172))
+#loc353 = loc("M_block_ptr"(#loc173))
+#loc354 = loc("L_block_ptr"(#loc174))
+#loc355 = loc("m_i"(#loc175))
+#loc356 = loc("l_i"(#loc176))
+#loc357 = loc("idx_hq"(#loc179))
+#loc358 = loc("mask"(#loc180))
+#loc359 = loc("acc"(#loc181))
+#loc360 = loc(callsite(#loc3 at #loc206))
+#loc361 = loc(callsite(#loc13 at #loc210))
+#loc362 = loc(callsite(#loc15 at #loc210))
+#loc363 = loc(callsite(#loc13 at #loc211))
+#loc364 = loc(callsite(#loc15 at #loc211))
+#loc365 = loc(callsite(#loc33 at #loc227))
+#loc366 = loc(callsite(#loc13 at #loc248))
+#loc367 = loc(callsite(#loc15 at #loc248))
+#loc368 = loc("l_i"(#loc253))
+#loc369 = loc(callsite(#loc254 at #loc255))
+#loc370 = loc(callsite(#loc256 at #loc255))
+#loc371 = loc(callsite(#loc258 at #loc255))
+#loc372 = loc(callsite(#loc262 at #loc255))
+#loc373 = loc(callsite(#loc263 at #loc255))
+#loc374 = loc(callsite(#loc264 at #loc255))
+#loc375 = loc(callsite(#loc265 at #loc255))
+#loc376 = loc(callsite(#loc266 at #loc255))
+#loc377 = loc(callsite(#loc267 at #loc255))
+#loc378 = loc(callsite(#loc268 at #loc255))
+#loc379 = loc(callsite(#loc269 at #loc255))
+#loc380 = loc(callsite(#loc270 at #loc255))
+#loc381 = loc(callsite(#loc271 at #loc255))
+#loc382 = loc(callsite(#loc272 at #loc255))
+#loc383 = loc(callsite(#loc273 at #loc255))
+#loc384 = loc(callsite(#loc274 at #loc255))
+#loc385 = loc(callsite(#loc275 at #loc255))
+#loc386 = loc(callsite(#loc276 at #loc255))
+#loc387 = loc(callsite(#loc277 at #loc255))
+#loc388 = loc(callsite(#loc278 at #loc255))
+#loc389 = loc(callsite(#loc279 at #loc255))
+#loc390 = loc(callsite(#loc280 at #loc255))
+#loc391 = loc(callsite(#loc281 at #loc255))
+#loc392 = loc(callsite(#loc282 at #loc255))
+#loc393 = loc(callsite(#loc283 at #loc255))
+#loc394 = loc(callsite(#loc284 at #loc255))
+#loc395 = loc(callsite(#loc285 at #loc255))
+#loc396 = loc(callsite(#loc286 at #loc255))
+#loc397 = loc(callsite(#loc287 at #loc255))
+#loc398 = loc(callsite(#loc288 at #loc255))
+#loc399 = loc(callsite(#loc289 at #loc255))
+#loc400 = loc(callsite(#loc290 at #loc255))
+#loc401 = loc(callsite(#loc291 at #loc255))
+#loc402 = loc(callsite(#loc292 at #loc255))
+#loc403 = loc(callsite(#loc293 at #loc255))
+#loc404 = loc(callsite(#loc294 at #loc255))
+#loc405 = loc(callsite(#loc295 at #loc255))
+#loc407 = loc(callsite(#loc297 at #loc255))
+#loc408 = loc(callsite(#loc298 at #loc255))
+#loc409 = loc(callsite(#loc299 at #loc255))
+#loc410 = loc(callsite(#loc300 at #loc255))
+#loc411 = loc(callsite(#loc301 at #loc255))
+#loc412 = loc(callsite(#loc302 at #loc255))
+#loc413 = loc(callsite(#loc303 at #loc255))
+#loc414 = loc(callsite(#loc304 at #loc255))
+#loc415 = loc(callsite(#loc305 at #loc255))
+#loc417 = loc(callsite(#loc307 at #loc255))
+#loc418 = loc(callsite(#loc308 at #loc255))
+#loc419 = loc(callsite(#loc309 at #loc255))
+#loc420 = loc(callsite(#loc310 at #loc255))
+#loc421 = loc(callsite(#loc311 at #loc255))
+#loc422 = loc(callsite(#loc312 at #loc255))
+#loc423 = loc(callsite(#loc314 at #loc2))
+#loc424 = loc(callsite(#loc331 at #loc2))
+#loc425 = loc(callsite(#loc332 at #loc2))
+#loc426 = loc(callsite(#loc254 at #loc346))
+#loc427 = loc(callsite(#loc256 at #loc346))
+#loc428 = loc(callsite(#loc258 at #loc346))
+#loc429 = loc(callsite(#loc262 at #loc346))
+#loc430 = loc(callsite(#loc263 at #loc346))
+#loc431 = loc(callsite(#loc264 at #loc346))
+#loc432 = loc(callsite(#loc267 at #loc346))
+#loc433 = loc(callsite(#loc268 at #loc346))
+#loc434 = loc(callsite(#loc295 at #loc346))
+#loc436 = loc(callsite(#loc297 at #loc346))
+#loc437 = loc(callsite(#loc298 at #loc346))
+#loc438 = loc(callsite(#loc299 at #loc346))
+#loc439 = loc(callsite(#loc300 at #loc346))
+#loc440 = loc(callsite(#loc301 at #loc346))
+#loc441 = loc(callsite(#loc302 at #loc346))
+#loc442 = loc(callsite(#loc303 at #loc346))
+#loc443 = loc(callsite(#loc304 at #loc346))
+#loc444 = loc(callsite(#loc305 at #loc346))
+#loc446 = loc(callsite(#loc307 at #loc346))
+#loc447 = loc(callsite(#loc308 at #loc346))
+#loc448 = loc(callsite(#loc309 at #loc346))
+#loc449 = loc(callsite(#loc310 at #loc346))
+#loc450 = loc(callsite(#loc311 at #loc346))
+#loc451 = loc(callsite(#loc312 at #loc346))
+#loc452 = loc(callsite(#loc314 at #loc167))
+#loc453 = loc(callsite(#loc331 at #loc167))
+#loc454 = loc(callsite(#loc332 at #loc167))
+#loc455 = loc("m_i"(#loc368))
+#loc456 = loc(callsite(#loc257 at #loc371))
+#loc457 = loc(callsite(#loc259 at #loc371))
+#loc458 = loc(callsite(#loc260 at #loc371))
+#loc459 = loc(callsite(#loc261 at #loc371))
+#loc460 = loc(callsite(#loc73 at #loc371))
+#loc461 = loc(callsite(#loc74 at #loc371))
+#loc462 = loc(callsite(#loc78 at #loc375))
+#loc463 = loc(callsite(#loc78 at #loc376))
+#loc464 = loc(callsite(#loc110 at #loc406))
+#loc466 = loc(callsite(#loc122 at #loc416))
+#loc468 = loc(callsite(#loc260 at #loc420))
+#loc469 = loc(callsite(#loc261 at #loc420))
+#loc470 = loc(callsite(#loc74 at #loc420))
+#loc471 = loc(callsite(#loc313 at #loc423))
+#loc472 = loc(callsite(#loc315 at #loc423))
+#loc473 = loc(callsite(#loc316 at #loc423))
+#loc474 = loc(callsite(#loc317 at #loc423))
+#loc475 = loc(callsite(#loc318 at #loc423))
+#loc476 = loc(callsite(#loc319 at #loc423))
+#loc477 = loc(callsite(#loc320 at #loc423))
+#loc478 = loc(callsite(#loc321 at #loc423))
+#loc479 = loc(callsite(#loc322 at #loc423))
+#loc480 = loc(callsite(#loc323 at #loc423))
+#loc481 = loc(callsite(#loc324 at #loc423))
+#loc482 = loc(callsite(#loc325 at #loc423))
+#loc483 = loc(callsite(#loc326 at #loc423))
+#loc484 = loc(callsite(#loc327 at #loc423))
+#loc485 = loc(callsite(#loc328 at #loc423))
+#loc486 = loc(callsite(#loc329 at #loc423))
+#loc487 = loc(callsite(#loc330 at #loc423))
+#loc488 = loc(callsite(#loc257 at #loc428))
+#loc489 = loc(callsite(#loc259 at #loc428))
+#loc490 = loc(callsite(#loc260 at #loc428))
+#loc491 = loc(callsite(#loc261 at #loc428))
+#loc492 = loc(callsite(#loc73 at #loc428))
+#loc493 = loc(callsite(#loc74 at #loc428))
+#loc494 = loc(callsite(#loc110 at #loc435))
+#loc496 = loc(callsite(#loc122 at #loc445))
+#loc498 = loc(callsite(#loc260 at #loc449))
+#loc499 = loc(callsite(#loc261 at #loc449))
+#loc500 = loc(callsite(#loc74 at #loc449))
+#loc501 = loc(callsite(#loc313 at #loc452))
+#loc502 = loc(callsite(#loc315 at #loc452))
+#loc503 = loc(callsite(#loc316 at #loc452))
+#loc504 = loc(callsite(#loc317 at #loc452))
+#loc505 = loc(callsite(#loc318 at #loc452))
+#loc506 = loc(callsite(#loc319 at #loc452))
+#loc507 = loc(callsite(#loc320 at #loc452))
+#loc508 = loc(callsite(#loc321 at #loc452))
+#loc509 = loc(callsite(#loc322 at #loc452))
+#loc510 = loc(callsite(#loc323 at #loc452))
+#loc511 = loc(callsite(#loc324 at #loc452))
+#loc512 = loc(callsite(#loc325 at #loc452))
+#loc513 = loc(callsite(#loc326 at #loc452))
+#loc514 = loc(callsite(#loc327 at #loc452))
+#loc515 = loc(callsite(#loc328 at #loc452))
+#loc516 = loc(callsite(#loc329 at #loc452))
+#loc517 = loc(callsite(#loc330 at #loc452))
+#loc518 = loc("offs_n"(#loc455))
+#loc519 = loc(callsite(#loc112 at #loc464))
+#loc520 = loc(callsite(#loc124 at #loc466))
+#loc521 = loc(callsite(#loc112 at #loc494))
+#loc522 = loc(callsite(#loc124 at #loc496))
+#loc523 = loc("kv_offset"(#loc518))
+#loc524 = loc(callsite(#loc523 at #loc2))
+#loc525 = loc(callsite(#loc523 at #loc167))

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/__grp__triton_poi_fused_mul_1.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"child_paths": {"triton_poi_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.source", "triton_poi_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ttir", "triton_poi_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ttgir", "triton_poi_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.llir", "triton_poi_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ptx", "triton_poi_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.cubin", "triton_poi_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.json"}}

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.cubin ADDED Viewed

Binary file (5.78 kB). View file

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"hash": "ffd9fb24da74ed60a968acef8976d2fea617718c7565c58b1705308246ad732c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_1"}

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.llir ADDED Viewed

	@@ -0,0 +1,58 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 8, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 1, !dbg !9
+  %10 = and i32 %9, 254, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sext i32 %11 to i64, !dbg !11
+  %13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !11
+  %14 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l"(ptr addrspace(1) %13) #2, !dbg !12
+  %15 = extractvalue { i32, i32 } %14, 0, !dbg !12
+  %16 = extractvalue { i32, i32 } %14, 1, !dbg !12
+  %17 = bitcast i32 %15 to float, !dbg !12
+  %18 = bitcast i32 %16 to float, !dbg !12
+  %19 = fmul float %17, 0x3FE62E4300000000, !dbg !13
+  %20 = fmul float %18, 0x3FE62E4300000000, !dbg !13
+  %21 = getelementptr float, ptr addrspace(1) %1, i64 %12, !dbg !14
+  %22 = bitcast float %19 to i32, !dbg !15
+  %23 = bitcast float %20 to i32, !dbg !15
+  tail call void asm sideeffect "st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l"(i32 %22, i32 %23, ptr addrspace(1) %21) #2, !dbg !15
+  ret void, !dbg !16
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_1", linkageName: "triton_poi_fused_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 30, scope: !4)
+!12 = !DILocation(line: 24, column: 35, scope: !4)
+!13 = !DILocation(line: 26, column: 18, scope: !4)
+!14 = !DILocation(line: 27, column: 25, scope: !4)
+!15 = !DILocation(line: 27, column: 36, scope: !4)
+!16 = !DILocation(line: 27, column: 4, scope: !4)

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ptx ADDED Viewed

	@@ -0,0 +1,221 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.7
+.target sm_90a
+.address_size 64
+	// .globl	triton_poi_fused_mul_1  // -- Begin function triton_poi_fused_mul_1
+                                        // @triton_poi_fused_mul_1
+.visible .entry triton_poi_fused_mul_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_1,
+	.param .u32 triton_poi_fused_mul_1_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_4
+)
+.reqntid 128
+{
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:18:0
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused_mul_1_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused_mul_1_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:20:33
+	shl.b32 	%r6, %r5, 8;
+	.loc	1 21 36                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1;
+	and.b32 	%r9, %r8, 254;
+	.loc	1 21 23                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 30                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:24:30
+	mul.wide.s32 	%rd5, %r10, 4;
+	add.s64 	%rd1, %rd3, %rd5;
+	.loc	1 24 35                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:24:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	ld.global.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 18                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:26:18
+	mul.f32 	%r3, %r1, 0f3F317218;
+	mul.f32 	%r4, %r2, 0f3F317218;
+	.loc	1 27 25                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:27:25
+	add.s64 	%rd2, %rd4, %rd5;
+	.loc	1 27 36                         // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:27:36
+	// begin inline asm
+	st.global.v2.b32 [ %rd2 + 0 ], { %r3, %r4 };
+	// end inline asm
+	.loc	1 27 4                          // cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py:27:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 139                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x84 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 106
+.b8 99
+.b8 121
+.b8 100
+.b8 102
+.b8 97
+.b8 53
+.b8 54
+.b8 98
+.b8 114
+.b8 113
+.b8 115
+.b8 115
+.b8 115
+.b8 102
+.b8 117
+.b8 119
+.b8 54
+.b8 110
+.b8 121
+.b8 55
+.b8 110
+.b8 53
+.b8 51
+.b8 104
+.b8 100
+.b8 122
+.b8 117
+.b8 104
+.b8 53
+.b8 99
+.b8 108
+.b8 52
+.b8 105
+.b8 50
+.b8 103
+.b8 112
+.b8 100
+.b8 122
+.b8 114
+.b8 122
+.b8 122
+.b8 54
+.b8 107
+.b8 54
+.b8 108
+.b8 101
+.b8 105
+.b8 105
+.b8 100
+.b8 102
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 119
+.b8 111
+.b8 114
+.b8 107
+.b8 115
+.b8 112
+.b8 97
+.b8 99
+.b8 101
+.b8 47
+.b8 104
+.b8 97
+.b8 110
+.b8 114
+.b8 117
+.b8 105
+.b8 47
+.b8 106
+.b8 117
+.b8 110
+.b8 113
+.b8 117
+.b8 97
+.b8 110
+.b8 47
+.b8 83
+.b8 112
+.b8 101
+.b8 99
+.b8 70
+.b8 111
+.b8 114
+.b8 103
+.b8 101
+.b8 47
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 107
+.b8 101
+.b8 114
+.b8 110
+.b8 101
+.b8 108
+.b8 115
+.b8 47
+.b8 106
+.b8 99
+.b8 0
+	}
+	.section	.debug_macinfo	{	}

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.source ADDED Viewed

	@@ -0,0 +1,51 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":18:0)
+#loc14 = loc("in_ptr0"(#loc))
+#loc15 = loc("out_ptr0"(#loc))
+#loc16 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 24576 : i32 loc(#loc17)
+    %xoffset = tt.get_program_id x : i32 loc(#loc18)
+    %xoffset_1 = arith.constant 256 : i32 loc(#loc19)
+    %xoffset_2 = arith.constant 256 : i32 loc(#loc19)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc19)
+    %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc20)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<256xi32> loc(#loc21)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<256xi32> loc(#loc21)
+    %xmask = arith.constant true loc(#loc22)
+    %xmask_6 = arith.constant dense<true> : tensor<256xi1> loc(#loc22)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc23)
+    %tmp0_7 = tt.addptr %tmp0, %xindex_5 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc23)
+    %tmp0_8 = tt.load %tmp0_7 : tensor<256x!tt.ptr<f32>> loc(#loc24)
+    %tmp1 = arith.constant 0.693147182 : f32 loc(#loc25)
+    %tmp2 = arith.constant dense<0.693147182> : tensor<256xf32> loc(#loc26)
+    %tmp2_9 = arith.mulf %tmp0_8, %tmp2 : tensor<256xf32> loc(#loc26)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc11)
+    %1 = tt.addptr %0, %xindex_5 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc11)
+    tt.store %1, %tmp2_9 : tensor<256x!tt.ptr<f32>> loc(#loc12)
+    tt.return loc(#loc13)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":19:13)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":20:28)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":20:33)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":21:36)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":21:23)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":22:36)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":24:30)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":24:35)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":25:11)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":26:18)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:25)
+#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:36)
+#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:4)
+#loc17 = loc("xnumel"(#loc1))
+#loc18 = loc("xoffset"(#loc2))
+#loc19 = loc("xoffset"(#loc3))
+#loc20 = loc("xindex"(#loc4))
+#loc21 = loc("xindex"(#loc5))
+#loc22 = loc("xmask"(#loc6))
+#loc23 = loc("tmp0"(#loc7))
+#loc24 = loc("tmp0"(#loc8))
+#loc25 = loc("tmp1"(#loc9))
+#loc26 = loc("tmp2"(#loc10))

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ttgir ADDED Viewed

	@@ -0,0 +1,42 @@

+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":18:0)
+#loc12 = loc("in_ptr0"(#loc))
+#loc13 = loc("out_ptr0"(#loc))
+#loc14 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.693147182> : tensor<256xf32, #blocked> loc(#loc1)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc15)
+    %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc16)
+    %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc17)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32, #blocked> loc(#loc18)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32, #blocked> loc(#loc18)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>, #blocked> loc(#loc19)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<256x!tt.ptr<f32>, #blocked>, tensor<256xi32, #blocked> loc(#loc19)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<256x!tt.ptr<f32>, #blocked> loc(#loc20)
+    %tmp2 = arith.mulf %tmp0_4, %cst : tensor<256xf32, #blocked> loc(#loc21)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>, #blocked> loc(#loc9)
+    %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr<f32>, #blocked>, tensor<256xi32, #blocked> loc(#loc9)
+    tt.store %1, %tmp2 : tensor<256x!tt.ptr<f32>, #blocked> loc(#loc10)
+    tt.return loc(#loc11)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":20:28)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":20:33)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":21:36)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":21:23)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":24:30)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":24:35)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":26:18)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:25)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:36)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:4)
+#loc15 = loc("xoffset"(#loc2))
+#loc16 = loc("xoffset"(#loc3))
+#loc17 = loc("xindex"(#loc4))
+#loc18 = loc("xindex"(#loc5))
+#loc19 = loc("tmp0"(#loc6))
+#loc20 = loc("tmp0"(#loc7))
+#loc21 = loc("tmp2"(#loc8))

progress/github/SpecForge/cache/compiled_kernels/triton/1/77M7WJG2OTWWBKLIVTXYS5WS72TBO4MMOVS4LCYXAUYIERVNOMWA/triton_poi_fused_mul_1.ttir ADDED Viewed

	@@ -0,0 +1,41 @@

+#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":18:0)
+#loc12 = loc("in_ptr0"(#loc))
+#loc13 = loc("out_ptr0"(#loc))
+#loc14 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp2 = arith.constant dense<0.693147182> : tensor<256xf32> loc(#loc15)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc16)
+    %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc17)
+    %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc18)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc19)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc19)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc20)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc20)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<256x!tt.ptr<f32>> loc(#loc21)
+    %tmp2_5 = arith.mulf %tmp0_4, %tmp2 : tensor<256xf32> loc(#loc15)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc9)
+    %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc9)
+    tt.store %1, %tmp2_5 : tensor<256x!tt.ptr<f32>> loc(#loc10)
+    tt.return loc(#loc11)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":26:18)
+#loc2 = loc(unknown)
+#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":20:28)
+#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":20:33)
+#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":21:36)
+#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":21:23)
+#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":24:30)
+#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":24:35)
+#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:25)
+#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:36)
+#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/jc/cjcydfa56brqsssfuw6ny7n53hdzuh5cl4i2gpdzrzz6k6leiidf.py":27:4)
+#loc15 = loc("tmp2"(#loc1))
+#loc16 = loc("xoffset"(#loc3))
+#loc17 = loc("xoffset"(#loc4))
+#loc18 = loc("xindex"(#loc5))
+#loc19 = loc("xindex"(#loc6))
+#loc20 = loc("tmp0"(#loc7))
+#loc21 = loc("tmp0"(#loc8))

progress/github/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/__grp__triton_red_fused_mul_0.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"child_paths": {"triton_red_fused_mul_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.source", "triton_red_fused_mul_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttir", "triton_red_fused_mul_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttgir", "triton_red_fused_mul_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.llir", "triton_red_fused_mul_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ptx", "triton_red_fused_mul_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.cubin", "triton_red_fused_mul_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.json"}}

progress/github/SpecForge/cache/compiled_kernels/triton/1/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.cubin ADDED Viewed

Binary file (16.3 kB). View file