Lekr0 commited on
Commit
32a1ae1
·
verified ·
1 Parent(s): 0941def

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json +1 -0
  2. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin +0 -0
  3. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json +1 -0
  4. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir +204 -0
  5. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx +525 -0
  6. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source +193 -0
  7. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir +147 -0
  8. SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir +152 -0
  9. SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/__grp__triton_tem_fused_zeros_1.json +1 -0
  10. SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json +1 -0
  11. SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir +0 -0
  12. SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx +0 -0
  13. SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source +0 -0
  14. SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir +0 -0
  15. SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir +0 -0
  16. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json +1 -0
  17. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin +0 -0
  18. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json +1 -0
  19. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir +333 -0
  20. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx +809 -0
  21. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source +418 -0
  22. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir +280 -0
  23. SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir +283 -0
  24. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
  25. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin +0 -0
  26. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
  27. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir +781 -0
  28. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx +1410 -0
  29. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source +0 -0
  30. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir +841 -0
  31. SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir +799 -0
  32. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/__grp__triton_red_fused_argmax_1.json +1 -0
  33. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin +0 -0
  34. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json +1 -0
  35. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir +611 -0
  36. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx +1196 -0
  37. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source +315 -0
  38. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir +203 -0
  39. SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir +204 -0
  40. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/__grp__triton_poi_fused_new_zeros_0.json +1 -0
  41. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin +0 -0
  42. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json +1 -0
  43. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir +52 -0
  44. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx +214 -0
  45. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source +38 -0
  46. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir +35 -0
  47. SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir +34 -0
  48. SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/__grp__triton_red_fused_zeros_0.json +1 -0
  49. SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin +0 -0
  50. SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json +1 -0
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused__to_copy_clone_slice_sum_transpose_5.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json"}}
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin ADDED
Binary file (17.2 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d1c2e6527ce27b628a96c1a250025b39aad19d679fce295e820390aa7ae64b66", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_clone_slice_sum_transpose_5"}
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 {
9
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %10 = shl i32 %9, 5, !dbg !8
11
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %12 = and i32 %11, 31, !dbg !9
13
+ %13 = or disjoint i32 %10, %12, !dbg !10
14
+ %14 = icmp slt i32 %13, %4, !dbg !11
15
+ %15 = lshr i32 %11, 5, !dbg !12
16
+ %16 = and i32 %15, 3, !dbg !12
17
+ %17 = sext i32 %13 to i64, !dbg !13
18
+ %.frozen = freeze i64 %2, !dbg !14
19
+ %18 = sdiv i64 %17, %.frozen, !dbg !14
20
+ %19 = mul i64 %18, %.frozen, !dbg !13
21
+ %.decomposed = sub i64 %17, %19, !dbg !13
22
+ %20 = icmp sgt i32 %5, 0, !dbg !15
23
+ br i1 %20, label %.lr.ph, label %._crit_edge, !dbg !15
24
+
25
+ .lr.ph: ; preds = %8
26
+ %21 = mul i64 %3, %2, !dbg !16
27
+ %22 = mul i64 %21, %18, !dbg !17
28
+ %23 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed
29
+ %invariant.gep = getelementptr i32, ptr addrspace(1) %23, i64 %22, !dbg !15
30
+ %24 = insertelement <4 x i1> poison, i1 %14, i64 0, !dbg !18
31
+ %25 = shufflevector <4 x i1> %24, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !18
32
+ %26 = insertelement <4 x i32> poison, i32 %5, i64 0, !dbg !19
33
+ %27 = shufflevector <4 x i32> %26, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !19
34
+ br label %28, !dbg !15
35
+
36
+ 28: ; preds = %.lr.ph, %28
37
+ %29 = phi i32 [ 0, %.lr.ph ], [ %68, %28 ]
38
+ %30 = phi <4 x i64> [ zeroinitializer, %.lr.ph ], [ %67, %28 ]
39
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
40
+ %32 = or disjoint i32 %29, %16, !dbg !21
41
+ %33 = or disjoint i32 %32, 4, !dbg !21
42
+ %34 = or disjoint i32 %32, 8, !dbg !21
43
+ %35 = or disjoint i32 %32, 12, !dbg !21
44
+ %36 = insertelement <4 x i32> poison, i32 %32, i64 0, !dbg !19
45
+ %37 = insertelement <4 x i32> %36, i32 %33, i64 1, !dbg !19
46
+ %38 = insertelement <4 x i32> %37, i32 %34, i64 2, !dbg !19
47
+ %39 = insertelement <4 x i32> %38, i32 %35, i64 3, !dbg !19
48
+ %40 = icmp slt <4 x i32> %39, %27, !dbg !19
49
+ %41 = sext i32 %32 to i64, !dbg !22
50
+ %42 = sext i32 %33 to i64, !dbg !22
51
+ %43 = sext i32 %34 to i64, !dbg !22
52
+ %44 = sext i32 %35 to i64, !dbg !22
53
+ %45 = mul i64 %2, %41, !dbg !22
54
+ %46 = mul i64 %2, %42, !dbg !22
55
+ %47 = mul i64 %2, %43, !dbg !22
56
+ %48 = mul i64 %2, %44, !dbg !22
57
+ %gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %45, !dbg !23
58
+ %gep4 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %46, !dbg !23
59
+ %gep6 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %47, !dbg !23
60
+ %gep8 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %48, !dbg !23
61
+ %49 = and <4 x i1> %25, %40, !dbg !18
62
+ %50 = extractelement <4 x i1> %49, i64 0, !dbg !20
63
+ %51 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %31, i1 %50) #5, !dbg !20
64
+ %52 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
65
+ %53 = extractelement <4 x i1> %49, i64 1, !dbg !20
66
+ %54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep4, i64 %52, i1 %53) #5, !dbg !20
67
+ %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
68
+ %56 = extractelement <4 x i1> %49, i64 2, !dbg !20
69
+ %57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep6, i64 %55, i1 %56) #5, !dbg !20
70
+ %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
71
+ %59 = extractelement <4 x i1> %49, i64 3, !dbg !20
72
+ %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep8, i64 %58, i1 %59) #5, !dbg !20
73
+ %61 = insertelement <4 x i32> poison, i32 %51, i64 0, !dbg !24
74
+ %62 = insertelement <4 x i32> %61, i32 %54, i64 1, !dbg !24
75
+ %63 = insertelement <4 x i32> %62, i32 %57, i64 2, !dbg !24
76
+ %64 = insertelement <4 x i32> %63, i32 %60, i64 3, !dbg !24
77
+ %65 = sext <4 x i32> %64 to <4 x i64>, !dbg !24
78
+ %66 = select <4 x i1> %49, <4 x i64> %65, <4 x i64> zeroinitializer, !dbg !25
79
+ %67 = add <4 x i64> %66, %30, !dbg !25
80
+ %68 = add i32 %29, 16, !dbg !15
81
+ %69 = icmp slt i32 %68, %5, !dbg !15
82
+ br i1 %69, label %28, label %._crit_edge.loopexit, !dbg !15
83
+
84
+ ._crit_edge.loopexit: ; preds = %28
85
+ %70 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %67), !dbg !26
86
+ br label %._crit_edge, !dbg !26
87
+
88
+ ._crit_edge: ; preds = %._crit_edge.loopexit, %8
89
+ %71 = phi i64 [ 0, %8 ], [ %70, %._crit_edge.loopexit ], !dbg !26
90
+ %.idx = shl nuw nsw i32 %12, 5, !dbg !30
91
+ %72 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !30
92
+ %73 = getelementptr i64, ptr addrspace(3) %72, i32 %16, !dbg !30
93
+ %74 = insertelement <1 x i64> poison, i64 %71, i64 0, !dbg !30
94
+ tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %73, <1 x i64> %74, i1 true) #5, !dbg !30
95
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
96
+ %75 = icmp samesign ult i32 %11, 128, !dbg !30
97
+ %76 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !30
98
+ %77 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %76, i1 %75) #5, !dbg !30
99
+ %extelt.offset = lshr i64 %77, 32, !dbg !30
100
+ %78 = trunc nuw i64 %extelt.offset to i32, !dbg !30
101
+ %79 = trunc i64 %77 to i32, !dbg !30
102
+ %80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 2, i32 31), !dbg !30
103
+ %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 2, i32 31), !dbg !30
104
+ %82 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !30
105
+ %83 = insertelement <2 x i32> %82, i32 %81, i64 1, !dbg !30
106
+ %84 = bitcast <2 x i32> %83 to i64, !dbg !30
107
+ %85 = add i64 %77, %84, !dbg !26
108
+ %extelt.offset2 = lshr i64 %85, 32, !dbg !30
109
+ %86 = trunc nuw i64 %extelt.offset2 to i32, !dbg !30
110
+ %87 = trunc i64 %85 to i32, !dbg !30
111
+ %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !30
112
+ %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !30
113
+ %90 = insertelement <2 x i32> poison, i32 %88, i64 0, !dbg !30
114
+ %91 = insertelement <2 x i32> %90, i32 %89, i64 1, !dbg !30
115
+ %92 = bitcast <2 x i32> %91 to i64, !dbg !30
116
+ %93 = add i64 %85, %92, !dbg !26
117
+ %94 = and i32 %11, 899, !dbg !30
118
+ %95 = icmp eq i32 %94, 0, !dbg !30
119
+ %96 = insertelement <1 x i64> poison, i64 %93, i64 0, !dbg !30
120
+ tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %76, <1 x i64> %96, i1 %95) #5, !dbg !30
121
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
122
+ %97 = load i64, ptr addrspace(3) %72, align 16, !dbg !30
123
+ %98 = trunc i64 %97 to i32, !dbg !31
124
+ %99 = icmp slt i64 %2, 2, !dbg !32
125
+ %100 = icmp sgt i64 %2, 1, !dbg !33
126
+ %101 = select i1 %100, i64 %2, i64 0, !dbg !34
127
+ %102 = zext i1 %99 to i64, !dbg !35
128
+ %103 = add i64 %101, %102, !dbg !36
129
+ %104 = mul i64 %18, %103, !dbg !37
130
+ %105 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !38
131
+ %106 = getelementptr i32, ptr addrspace(1) %105, i64 %104, !dbg !38
132
+ %107 = and i32 %11, 96, !dbg !39
133
+ %108 = icmp eq i32 %107, 0, !dbg !39
134
+ %109 = and i1 %108, %14, !dbg !39
135
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %98, ptr addrspace(1) %106, i1 %109) #5, !dbg !39
136
+ ret void, !dbg !40
137
+ }
138
+
139
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
140
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
141
+
142
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
143
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
144
+
145
+ ; Function Attrs: convergent nocallback nounwind
146
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
147
+
148
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
149
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
150
+
151
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
152
+ declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4
153
+
154
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
155
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
156
+ attributes #2 = { convergent nocallback nounwind }
157
+ attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
158
+ attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
159
+ attributes #5 = { nounwind }
160
+
161
+ !llvm.dbg.cu = !{!0}
162
+ !llvm.module.flags = !{!2, !3}
163
+
164
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
165
+ !1 = !DIFile(filename: "cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt")
166
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
167
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
168
+ !4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
169
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
170
+ !6 = !{}
171
+ !7 = !DILocation(line: 21, column: 28, scope: !4)
172
+ !8 = !DILocation(line: 21, column: 33, scope: !4)
173
+ !9 = !DILocation(line: 22, column: 44, scope: !4)
174
+ !10 = !DILocation(line: 22, column: 23, scope: !4)
175
+ !11 = !DILocation(line: 23, column: 21, scope: !4)
176
+ !12 = !DILocation(line: 24, column: 37, scope: !4)
177
+ !13 = !DILocation(line: 26, column: 19, scope: !4)
178
+ !14 = !DILocation(line: 27, column: 19, scope: !4)
179
+ !15 = !DILocation(line: 30, column: 40, scope: !4)
180
+ !16 = !DILocation(line: 36, column: 54, scope: !4)
181
+ !17 = !DILocation(line: 36, column: 58, scope: !4)
182
+ !18 = !DILocation(line: 36, column: 73, scope: !4)
183
+ !19 = !DILocation(line: 32, column: 29, scope: !4)
184
+ !20 = !DILocation(line: 36, column: 63, scope: !4)
185
+ !21 = !DILocation(line: 31, column: 31, scope: !4)
186
+ !22 = !DILocation(line: 36, column: 43, scope: !4)
187
+ !23 = !DILocation(line: 36, column: 34, scope: !4)
188
+ !24 = !DILocation(line: 37, column: 23, scope: !4)
189
+ !25 = !DILocation(line: 40, column: 48, scope: !4)
190
+ !26 = !DILocation(line: 261, column: 15, scope: !27, inlinedAt: !29)
191
+ !27 = distinct !DILexicalBlockFile(scope: !4, file: !28, discriminator: 0)
192
+ !28 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
193
+ !29 = !DILocation(line: 41, column: 25, scope: !4)
194
+ !30 = !DILocation(line: 291, column: 36, scope: !27, inlinedAt: !29)
195
+ !31 = !DILocation(line: 42, column: 19, scope: !4)
196
+ !32 = !DILocation(line: 43, column: 49, scope: !4)
197
+ !33 = !DILocation(line: 43, column: 75, scope: !4)
198
+ !34 = !DILocation(line: 43, column: 66, scope: !4)
199
+ !35 = !DILocation(line: 43, scope: !4)
200
+ !36 = !DILocation(line: 43, column: 57, scope: !4)
201
+ !37 = !DILocation(line: 43, column: 34, scope: !4)
202
+ !38 = !DILocation(line: 43, column: 25, scope: !4)
203
+ !39 = !DILocation(line: 43, column: 88, scope: !4)
204
+ !40 = !DILocation(line: 43, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_red_fused__to_copy_clone_slice_sum_transpose_5
12
+ .visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5(
13
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1,
15
+ .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2,
16
+ .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3,
17
+ .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4,
18
+ .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5,
19
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6,
20
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7
21
+ )
22
+ .reqntid 128
23
+ {
24
+ .reg .pred %p<24>;
25
+ .reg .b32 %r<51>;
26
+ .reg .b64 %rd<97>;
27
+ .loc 1 18 0 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:18:0
28
+ $L__func_begin0:
29
+ .loc 1 18 0 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:18:0
30
+
31
+ // %bb.0:
32
+ ld.param.b32 %r11, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5];
33
+ ld.param.b64 %rd20, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2];
34
+ $L__tmp0:
35
+ .loc 1 21 28 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:21:28
36
+ mov.u32 %r12, %ctaid.x;
37
+ .loc 1 21 33 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:21:33
38
+ shl.b32 %r13, %r12, 5;
39
+ .loc 1 22 44 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:22:44
40
+ mov.u32 %r1, %tid.x;
41
+ and.b32 %r2, %r1, 31;
42
+ .loc 1 22 23 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:22:23
43
+ or.b32 %r14, %r13, %r2;
44
+ .loc 1 26 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:26:19
45
+ cvt.s64.s32 %rd1, %r14;
46
+ .loc 1 27 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:27:19
47
+ or.b64 %rd23, %rd1, %rd20;
48
+ and.b64 %rd24, %rd23, -4294967296;
49
+ setp.ne.b64 %p5, %rd24, 0;
50
+ cvt.u32.u64 %r49, %rd1;
51
+ @%p5 bra $L__BB0_2;
52
+ bra.uni $L__BB0_1;
53
+ $L__BB0_2:
54
+ div.s64 %rd91, %rd1, %rd20;
55
+ bra.uni $L__BB0_3;
56
+ $L__BB0_1:
57
+ cvt.u32.u64 %r15, %rd20;
58
+ div.u32 %r17, %r49, %r15;
59
+ cvt.u64.u32 %rd91, %r17;
60
+ $L__BB0_3:
61
+ .loc 1 0 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:0:19
62
+ ld.param.b32 %r10, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4];
63
+ ld.param.b64 %rd19, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1];
64
+ bfe.u32 %r3, %r1, 5, 2;
65
+ .loc 1 26 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:26:19
66
+ mul.lo.s64 %rd26, %rd91, %rd20;
67
+ sub.s64 %rd6, %rd1, %rd26;
68
+ .loc 1 30 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:30:40
69
+ setp.lt.s32 %p6, %r11, 1;
70
+ mov.b64 %rd96, 0;
71
+ shl.b64 %rd90, %rd6, 2;
72
+ @%p6 bra $L__BB0_7;
73
+ // %bb.4: // %.lr.ph
74
+ .loc 1 0 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:0:40
75
+ ld.param.b64 %rd21, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3];
76
+ ld.param.b64 %rd18, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0];
77
+ .loc 1 23 21 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:23:21
78
+ setp.lt.s32 %p1, %r49, %r10;
79
+ .loc 1 36 54 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:54
80
+ mul.lo.s64 %rd31, %rd21, %rd20;
81
+ .loc 1 36 58 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:58
82
+ mul.lo.s64 %rd32, %rd31, %rd91;
83
+ add.s64 %rd34, %rd18, %rd90;
84
+ .loc 1 30 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:30:40
85
+ shl.b64 %rd35, %rd32, 2;
86
+ add.s64 %rd7, %rd34, %rd35;
87
+ mov.b64 %rd92, 0;
88
+ mov.b32 %r50, 0;
89
+ mov.b64 %rd93, %rd92;
90
+ mov.b64 %rd94, %rd92;
91
+ mov.b64 %rd95, %rd92;
92
+ $L__BB0_5: // =>This Inner Loop Header: Depth=1
93
+ .loc 1 36 63 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:63
94
+ // begin inline asm
95
+ mov.u64 %rd36, 0x0;
96
+ createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0;
97
+ // end inline asm
98
+ .loc 1 31 31 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:31:31
99
+ add.s32 %r24, %r3, %r50;
100
+ add.s32 %r25, %r24, 4;
101
+ add.s32 %r26, %r24, 8;
102
+ .loc 1 32 29 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:32:29
103
+ add.s32 %r27, %r24, 12;
104
+ setp.lt.s32 %p11, %r24, %r11;
105
+ setp.lt.s32 %p12, %r25, %r11;
106
+ setp.lt.s32 %p13, %r26, %r11;
107
+ setp.lt.s32 %p14, %r27, %r11;
108
+ .loc 1 36 43 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:43
109
+ cvt.s64.s32 %rd48, %r24;
110
+ cvt.s64.s32 %rd49, %r25;
111
+ cvt.s64.s32 %rd50, %r26;
112
+ cvt.s64.s32 %rd51, %r27;
113
+ mul.lo.s64 %rd52, %rd20, %rd48;
114
+ mul.lo.s64 %rd53, %rd20, %rd49;
115
+ mul.lo.s64 %rd54, %rd20, %rd50;
116
+ mul.lo.s64 %rd55, %rd20, %rd51;
117
+ .loc 1 36 34 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:34
118
+ shl.b64 %rd56, %rd52, 2;
119
+ add.s64 %rd37, %rd7, %rd56;
120
+ shl.b64 %rd57, %rd53, 2;
121
+ add.s64 %rd40, %rd7, %rd57;
122
+ shl.b64 %rd58, %rd54, 2;
123
+ add.s64 %rd43, %rd7, %rd58;
124
+ shl.b64 %rd59, %rd55, 2;
125
+ add.s64 %rd46, %rd7, %rd59;
126
+ .loc 1 36 73 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:73
127
+ and.pred %p10, %p1, %p14;
128
+ and.pred %p9, %p1, %p13;
129
+ and.pred %p8, %p1, %p12;
130
+ and.pred %p7, %p1, %p11;
131
+ .loc 1 36 63 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:63
132
+ // begin inline asm
133
+ mov.u32 %r20, 0x0;
134
+ @%p7 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd37 + 0 ], %rd36;
135
+ // end inline asm
136
+ // begin inline asm
137
+ mov.u64 %rd39, 0x0;
138
+ createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
139
+ // end inline asm
140
+ // begin inline asm
141
+ mov.u32 %r21, 0x0;
142
+ @%p8 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd40 + 0 ], %rd39;
143
+ // end inline asm
144
+ // begin inline asm
145
+ mov.u64 %rd42, 0x0;
146
+ createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0;
147
+ // end inline asm
148
+ // begin inline asm
149
+ mov.u32 %r22, 0x0;
150
+ @%p9 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd43 + 0 ], %rd42;
151
+ // end inline asm
152
+ // begin inline asm
153
+ mov.u64 %rd45, 0x0;
154
+ createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
155
+ // end inline asm
156
+ // begin inline asm
157
+ mov.u32 %r23, 0x0;
158
+ @%p10 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd46 + 0 ], %rd45;
159
+ // end inline asm
160
+ .loc 1 37 23 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:37:23
161
+ cvt.s64.s32 %rd60, %r20;
162
+ cvt.s64.s32 %rd61, %r21;
163
+ cvt.s64.s32 %rd62, %r22;
164
+ cvt.s64.s32 %rd63, %r23;
165
+ .loc 1 40 48 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:40:48
166
+ selp.b64 %rd64, %rd63, 0, %p10;
167
+ selp.b64 %rd65, %rd62, 0, %p9;
168
+ selp.b64 %rd66, %rd61, 0, %p8;
169
+ selp.b64 %rd67, %rd60, 0, %p7;
170
+ add.s64 %rd92, %rd67, %rd92;
171
+ add.s64 %rd93, %rd66, %rd93;
172
+ add.s64 %rd94, %rd65, %rd94;
173
+ add.s64 %rd95, %rd64, %rd95;
174
+ .loc 1 30 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:30:40
175
+ add.s32 %r50, %r50, 16;
176
+ setp.lt.s32 %p15, %r50, %r11;
177
+ @%p15 bra $L__BB0_5;
178
+ // %bb.6: // %._crit_edge.loopexit
179
+ $L__tmp1:
180
+ .loc 2 261 15 // standard.py:261:15 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
181
+ add.s64 %rd68, %rd92, %rd94;
182
+ add.s64 %rd69, %rd93, %rd95;
183
+ add.s64 %rd96, %rd68, %rd69;
184
+ $L__tmp2:
185
+ $L__BB0_7: // %._crit_edge
186
+ .loc 1 23 21 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:23:21
187
+ setp.lt.s32 %p20, %r49, %r10;
188
+ $L__tmp3:
189
+ .loc 2 291 36 // standard.py:291:36 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
190
+ shl.b32 %r33, %r2, 5;
191
+ mov.b32 %r34, global_smem;
192
+ add.s32 %r35, %r34, %r33;
193
+ shl.b32 %r36, %r3, 3;
194
+ add.s32 %r28, %r35, %r36;
195
+ mov.pred %p16, -1;
196
+ // begin inline asm
197
+ @%p16 st.shared.b64 [ %r28 + 0 ], %rd96;
198
+ // end inline asm
199
+ bar.sync 0;
200
+ setp.lt.u32 %p17, %r1, 128;
201
+ shl.b32 %r37, %r1, 3;
202
+ add.s32 %r29, %r34, %r37;
203
+ // begin inline asm
204
+ @%p17 ld.shared.b64 %rd71, [ %r29 + 0 ];
205
+ // end inline asm
206
+ mov.b64 {_, %r38}, %rd71;
207
+ cvt.u32.u64 %r39, %rd71;
208
+ shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
209
+ shfl.sync.bfly.b32 %r41, %r38, 2, 31, -1;
210
+ cvt.u64.u32 %rd74, %r40;
211
+ cvt.u64.u32 %rd75, %r41;
212
+ shl.b64 %rd76, %rd75, 32;
213
+ or.b64 %rd77, %rd74, %rd76;
214
+ .loc 2 261 15 // standard.py:261:15 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
215
+ add.s64 %rd78, %rd71, %rd77;
216
+ .loc 2 291 36 // standard.py:291:36 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
217
+ mov.b64 {_, %r42}, %rd78;
218
+ cvt.u32.u64 %r43, %rd78;
219
+ shfl.sync.bfly.b32 %r44, %r43, 1, 31, -1;
220
+ shfl.sync.bfly.b32 %r45, %r42, 1, 31, -1;
221
+ cvt.u64.u32 %rd79, %r44;
222
+ cvt.u64.u32 %rd80, %r45;
223
+ shl.b64 %rd81, %rd80, 32;
224
+ or.b64 %rd82, %rd79, %rd81;
225
+ .loc 2 261 15 // standard.py:261:15 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
226
+ add.s64 %rd72, %rd78, %rd82;
227
+ .loc 2 291 36 // standard.py:291:36 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
228
+ and.b32 %r46, %r1, 899;
229
+ setp.eq.b32 %p18, %r46, 0;
230
+ // begin inline asm
231
+ @%p18 st.shared.b64 [ %r29 + 0 ], %rd72;
232
+ // end inline asm
233
+ bar.sync 0;
234
+ ld.shared.b32 %r31, [%r35];
235
+ $L__tmp4:
236
+ .loc 1 43 49 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:49
237
+ setp.lt.s64 %p21, %rd20, 2;
238
+ .loc 1 43 75 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:75
239
+ setp.gt.s64 %p22, %rd20, 1;
240
+ .loc 1 43 66 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:66
241
+ selp.b64 %rd83, %rd20, 0, %p22;
242
+ .loc 1 43 0 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43
243
+ selp.b64 %rd84, 1, 0, %p21;
244
+ .loc 1 43 57 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:57
245
+ add.s64 %rd85, %rd83, %rd84;
246
+ .loc 1 43 34 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:34
247
+ mul.lo.s64 %rd86, %rd91, %rd85;
248
+ .loc 1 43 25 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:25
249
+ add.s64 %rd88, %rd19, %rd90;
250
+ shl.b64 %rd89, %rd86, 2;
251
+ add.s64 %rd73, %rd88, %rd89;
252
+ .loc 1 43 88 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:88
253
+ and.b32 %r47, %r1, 96;
254
+ setp.eq.b32 %p23, %r47, 0;
255
+ and.pred %p19, %p23, %p20;
256
+ // begin inline asm
257
+ @%p19 st.global.b32 [ %rd73 + 0 ], { %r31 };
258
+ // end inline asm
259
+ .loc 1 43 4 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:4
260
+ ret;
261
+ $L__tmp5:
262
+ $L__func_end0:
263
+ // -- End function
264
+ }
265
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py"
266
+ .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
267
+ .section .debug_abbrev
268
+ {
269
+ .b8 1 // Abbreviation Code
270
+ .b8 17 // DW_TAG_compile_unit
271
+ .b8 1 // DW_CHILDREN_yes
272
+ .b8 37 // DW_AT_producer
273
+ .b8 8 // DW_FORM_string
274
+ .b8 19 // DW_AT_language
275
+ .b8 5 // DW_FORM_data2
276
+ .b8 3 // DW_AT_name
277
+ .b8 8 // DW_FORM_string
278
+ .b8 16 // DW_AT_stmt_list
279
+ .b8 6 // DW_FORM_data4
280
+ .b8 27 // DW_AT_comp_dir
281
+ .b8 8 // DW_FORM_string
282
+ .b8 0 // EOM(1)
283
+ .b8 0 // EOM(2)
284
+ .b8 2 // Abbreviation Code
285
+ .b8 46 // DW_TAG_subprogram
286
+ .b8 0 // DW_CHILDREN_no
287
+ .b8 3 // DW_AT_name
288
+ .b8 8 // DW_FORM_string
289
+ .b8 32 // DW_AT_inline
290
+ .b8 11 // DW_FORM_data1
291
+ .b8 0 // EOM(1)
292
+ .b8 0 // EOM(2)
293
+ .b8 3 // Abbreviation Code
294
+ .b8 46 // DW_TAG_subprogram
295
+ .b8 1 // DW_CHILDREN_yes
296
+ .b8 17 // DW_AT_low_pc
297
+ .b8 1 // DW_FORM_addr
298
+ .b8 18 // DW_AT_high_pc
299
+ .b8 1 // DW_FORM_addr
300
+ .b8 49 // DW_AT_abstract_origin
301
+ .b8 19 // DW_FORM_ref4
302
+ .b8 0 // EOM(1)
303
+ .b8 0 // EOM(2)
304
+ .b8 4 // Abbreviation Code
305
+ .b8 29 // DW_TAG_inlined_subroutine
306
+ .b8 0 // DW_CHILDREN_no
307
+ .b8 49 // DW_AT_abstract_origin
308
+ .b8 19 // DW_FORM_ref4
309
+ .b8 17 // DW_AT_low_pc
310
+ .b8 1 // DW_FORM_addr
311
+ .b8 18 // DW_AT_high_pc
312
+ .b8 1 // DW_FORM_addr
313
+ .b8 88 // DW_AT_call_file
314
+ .b8 11 // DW_FORM_data1
315
+ .b8 89 // DW_AT_call_line
316
+ .b8 11 // DW_FORM_data1
317
+ .b8 87 // DW_AT_call_column
318
+ .b8 11 // DW_FORM_data1
319
+ .b8 0 // EOM(1)
320
+ .b8 0 // EOM(2)
321
+ .b8 0 // EOM(3)
322
+ }
323
+ .section .debug_info
324
+ {
325
+ .b32 238 // Length of Unit
326
+ .b8 2 // DWARF version number
327
+ .b8 0
328
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
329
+ .b8 8 // Address Size (in bytes)
330
+ .b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit
331
+ .b8 116 // DW_AT_producer
332
+ .b8 114
333
+ .b8 105
334
+ .b8 116
335
+ .b8 111
336
+ .b8 110
337
+ .b8 0
338
+ .b8 2 // DW_AT_language
339
+ .b8 0
340
+ .b8 99 // DW_AT_name
341
+ .b8 120
342
+ .b8 116
343
+ .b8 109
344
+ .b8 114
345
+ .b8 99
346
+ .b8 110
347
+ .b8 54
348
+ .b8 110
349
+ .b8 100
350
+ .b8 103
351
+ .b8 104
352
+ .b8 117
353
+ .b8 107
354
+ .b8 102
355
+ .b8 104
356
+ .b8 52
357
+ .b8 50
358
+ .b8 99
359
+ .b8 108
360
+ .b8 117
361
+ .b8 122
362
+ .b8 105
363
+ .b8 119
364
+ .b8 101
365
+ .b8 111
366
+ .b8 97
367
+ .b8 122
368
+ .b8 102
369
+ .b8 109
370
+ .b8 102
371
+ .b8 114
372
+ .b8 122
373
+ .b8 51
374
+ .b8 106
375
+ .b8 101
376
+ .b8 111
377
+ .b8 112
378
+ .b8 99
379
+ .b8 114
380
+ .b8 114
381
+ .b8 111
382
+ .b8 109
383
+ .b8 119
384
+ .b8 118
385
+ .b8 106
386
+ .b8 53
387
+ .b8 109
388
+ .b8 51
389
+ .b8 108
390
+ .b8 115
391
+ .b8 106
392
+ .b8 46
393
+ .b8 112
394
+ .b8 121
395
+ .b8 0
396
+ .b32 .debug_line // DW_AT_stmt_list
397
+ .b8 47 // DW_AT_comp_dir
398
+ .b8 119
399
+ .b8 111
400
+ .b8 114
401
+ .b8 107
402
+ .b8 115
403
+ .b8 112
404
+ .b8 97
405
+ .b8 99
406
+ .b8 101
407
+ .b8 47
408
+ .b8 104
409
+ .b8 97
410
+ .b8 110
411
+ .b8 114
412
+ .b8 117
413
+ .b8 105
414
+ .b8 47
415
+ .b8 83
416
+ .b8 112
417
+ .b8 101
418
+ .b8 99
419
+ .b8 70
420
+ .b8 111
421
+ .b8 114
422
+ .b8 103
423
+ .b8 101
424
+ .b8 45
425
+ .b8 101
426
+ .b8 120
427
+ .b8 116
428
+ .b8 47
429
+ .b8 99
430
+ .b8 97
431
+ .b8 99
432
+ .b8 104
433
+ .b8 101
434
+ .b8 47
435
+ .b8 99
436
+ .b8 111
437
+ .b8 109
438
+ .b8 112
439
+ .b8 105
440
+ .b8 108
441
+ .b8 101
442
+ .b8 100
443
+ .b8 95
444
+ .b8 107
445
+ .b8 101
446
+ .b8 114
447
+ .b8 110
448
+ .b8 101
449
+ .b8 108
450
+ .b8 115
451
+ .b8 47
452
+ .b8 120
453
+ .b8 116
454
+ .b8 0
455
+ .b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram
456
+ .b8 116 // DW_AT_name
457
+ .b8 114
458
+ .b8 105
459
+ .b8 116
460
+ .b8 111
461
+ .b8 110
462
+ .b8 95
463
+ .b8 114
464
+ .b8 101
465
+ .b8 100
466
+ .b8 95
467
+ .b8 102
468
+ .b8 117
469
+ .b8 115
470
+ .b8 101
471
+ .b8 100
472
+ .b8 95
473
+ .b8 95
474
+ .b8 116
475
+ .b8 111
476
+ .b8 95
477
+ .b8 99
478
+ .b8 111
479
+ .b8 112
480
+ .b8 121
481
+ .b8 95
482
+ .b8 99
483
+ .b8 108
484
+ .b8 111
485
+ .b8 110
486
+ .b8 101
487
+ .b8 95
488
+ .b8 115
489
+ .b8 108
490
+ .b8 105
491
+ .b8 99
492
+ .b8 101
493
+ .b8 95
494
+ .b8 115
495
+ .b8 117
496
+ .b8 109
497
+ .b8 95
498
+ .b8 116
499
+ .b8 114
500
+ .b8 97
501
+ .b8 110
502
+ .b8 115
503
+ .b8 112
504
+ .b8 111
505
+ .b8 115
506
+ .b8 101
507
+ .b8 95
508
+ .b8 53
509
+ .b8 0
510
+ .b8 1 // DW_AT_inline
511
+ .b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram
512
+ .b64 $L__func_begin0 // DW_AT_low_pc
513
+ .b64 $L__func_end0 // DW_AT_high_pc
514
+ .b32 139 // DW_AT_abstract_origin
515
+ .b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine
516
+ .b32 139 // DW_AT_abstract_origin
517
+ .b64 $L__tmp1 // DW_AT_low_pc
518
+ .b64 $L__tmp4 // DW_AT_high_pc
519
+ .b8 1 // DW_AT_call_file
520
+ .b8 41 // DW_AT_call_line
521
+ .b8 25 // DW_AT_call_column
522
+ .b8 0 // End Of Children Mark
523
+ .b8 0 // End Of Children Mark
524
+ }
525
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":18:0)
2
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
3
+ #loc43 = loc(unknown)
4
+ #loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
5
+ #loc50 = loc("in_ptr0"(#loc))
6
+ #loc51 = loc("out_ptr1"(#loc))
7
+ #loc52 = loc("ks0"(#loc))
8
+ #loc53 = loc("ks1"(#loc))
9
+ #loc54 = loc("xnumel"(#loc))
10
+ #loc55 = loc("r0_numel"(#loc))
11
+ #loc85 = loc("input"(#loc41))
12
+ #loc86 = loc("a"(#loc46))
13
+ #loc87 = loc("b"(#loc46))
14
+ module {
15
+ tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
16
+ %xoffset = tt.get_program_id x : i32 loc(#loc56)
17
+ %xoffset_0 = arith.constant 32 : i32 loc(#loc57)
18
+ %xoffset_1 = arith.constant 32 : i32 loc(#loc57)
19
+ %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc57)
20
+ %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc58)
21
+ %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc59)
22
+ %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<32x1xi32> loc(#loc60)
23
+ %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<32x1xi32> loc(#loc60)
24
+ %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc61)
25
+ %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<32x1xi32> loc(#loc61)
26
+ %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc62)
27
+ %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc63)
28
+ %x0 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc64)
29
+ %x0_8 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc64)
30
+ %x0_9 = arith.remsi %x0, %x0_8 : tensor<32x1xi64> loc(#loc64)
31
+ %x1 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc65)
32
+ %x1_10 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc65)
33
+ %x1_11 = arith.divsi %x1, %x1_10 : tensor<32x1xi64> loc(#loc65)
34
+ %_tmp3 = arith.constant 0 : i64 loc(#loc66)
35
+ %_tmp3_12 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc66)
36
+ %c0_i32 = arith.constant 0 : i32 loc(#loc12)
37
+ %c16_i32 = arith.constant 16 : i32 loc(#loc12)
38
+ %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12)
39
+ %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12)
40
+ %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc12)
41
+ %3 = ub.poison : i32 loc(#loc12)
42
+ %_tmp3_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_12) -> (tensor<32x16xi64>) : i32 {
43
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc68)
44
+ %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc68)
45
+ %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc69)
46
+ %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc69)
47
+ %tmp0 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70)
48
+ %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc70)
49
+ %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x16xi64> loc(#loc70)
50
+ %tmp0_23 = tt.broadcast %x0_9 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc71)
51
+ %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc71)
52
+ %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<32x16xi64> loc(#loc71)
53
+ %tmp0_26 = arith.muli %ks0, %ks1 : i64 loc(#loc72)
54
+ %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<32x1xi64> loc(#loc73)
55
+ %tmp0_28 = arith.muli %tmp0_27, %x1_11 : tensor<32x1xi64> loc(#loc73)
56
+ %tmp0_29 = tt.broadcast %tmp0_28 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc74)
57
+ %tmp0_30 = arith.addi %tmp0_25, %tmp0_29 : tensor<32x16xi64> loc(#loc74)
58
+ %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc75)
59
+ %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi64> loc(#loc75)
60
+ %tmp0_33 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc76)
61
+ %tmp0_34 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc76)
62
+ %tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<32x16xi1> loc(#loc76)
63
+ %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc77)
64
+ %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc77)
65
+ %tmp0_38 = arith.fptosi %tmp0_37 : tensor<32x16xf32> to tensor<32x16xi32> loc(#loc77)
66
+ %tmp0_39 = tt.load %tmp0_32, %tmp0_35, %tmp0_38 evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>> loc(#loc77)
67
+ %tmp1 = arith.extsi %tmp0_39 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc78)
68
+ %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<32x16xi64> loc(#loc79)
69
+ %_tmp3_40 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc80)
70
+ %_tmp3_41 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc80)
71
+ %_tmp3_42 = arith.andi %_tmp3_40, %_tmp3_41 : tensor<32x16xi1> loc(#loc80)
72
+ %_tmp3_43 = arith.select %_tmp3_42, %tmp4, %_tmp3_18 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc81)
73
+ scf.yield %_tmp3_43 : tensor<32x16xi64> loc(#loc27)
74
+ } loc(#loc67)
75
+ %tmp3 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_13) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc82)
76
+ %tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc83)
77
+ %tmp5 = arith.trunci %tmp3_14 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc84)
78
+ %c1_i32 = arith.constant 1 : i32 loc(#loc31)
79
+ %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc31)
80
+ %5 = arith.cmpi sge, %4, %ks0 : i64 loc(#loc31)
81
+ %c1_i32_15 = arith.constant 1 : i32 loc(#loc32)
82
+ %c1_i32_16 = arith.constant 1 : i32 loc(#loc32)
83
+ %6 = arith.extui %5 : i1 to i32 loc(#loc32)
84
+ %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc32)
85
+ %c1_i32_17 = arith.constant 1 : i32 loc(#loc33)
86
+ %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc33)
87
+ %9 = arith.cmpi sgt, %ks0, %8 : i64 loc(#loc33)
88
+ %10 = arith.extui %9 : i1 to i64 loc(#loc34)
89
+ %11 = arith.muli %ks0, %10 : i64 loc(#loc34)
90
+ %12 = arith.extsi %7 : i32 to i64 loc(#loc35)
91
+ %13 = arith.addi %12, %11 : i64 loc(#loc35)
92
+ %14 = tt.splat %13 : i64 -> tensor<32x1xi64> loc(#loc36)
93
+ %15 = arith.muli %x1_11, %14 : tensor<32x1xi64> loc(#loc36)
94
+ %16 = arith.addi %x0_9, %15 : tensor<32x1xi64> loc(#loc37)
95
+ %17 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>> loc(#loc38)
96
+ %18 = tt.addptr %17, %16 : tensor<32x1x!tt.ptr<i32>>, tensor<32x1xi64> loc(#loc38)
97
+ tt.store %18, %tmp5, %xmask_6 : tensor<32x1x!tt.ptr<i32>> loc(#loc39)
98
+ tt.return loc(#loc40)
99
+ } loc(#loc)
100
+ tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc41))) -> tensor<32xi64> attributes {noinline = false} {
101
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
102
+ ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)):
103
+ %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc42)
104
+ tt.reduce.return %2 : i64 loc(#loc42)
105
+ }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc42)
106
+ tt.return %0 : tensor<32xi64> loc(#loc44)
107
+ ^bb1: // no predecessors
108
+ %1 = ub.poison : tensor<32xi64> loc(#loc45)
109
+ tt.return %1 : tensor<32xi64> loc(#loc45)
110
+ } loc(#loc41)
111
+ tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc46)), %b: i64 loc("b"(#loc46))) -> i64 attributes {noinline = false} {
112
+ %0 = arith.addi %a, %b : i64 loc(#loc47)
113
+ tt.return %0 : i64 loc(#loc48)
114
+ ^bb1: // no predecessors
115
+ %1 = ub.poison : i64 loc(#loc49)
116
+ tt.return %1 : i64 loc(#loc49)
117
+ } loc(#loc46)
118
+ } loc(#loc)
119
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:28)
120
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:33)
121
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:36)
122
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:44)
123
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:23)
124
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":23:21)
125
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:27)
126
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:37)
127
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":26:19)
128
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":27:19)
129
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":28:43)
130
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":30:40)
131
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":31:31)
132
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":32:29)
133
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:43)
134
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:39)
135
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:54)
136
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:58)
137
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:50)
138
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:34)
139
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:73)
140
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:63)
141
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":37:23)
142
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":39:23)
143
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:35)
144
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:48)
145
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:8)
146
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:25)
147
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:28)
148
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":42:19)
149
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:49)
150
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:41)
151
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:75)
152
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:66)
153
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:57)
154
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:34)
155
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:30)
156
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:25)
157
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:88)
158
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:4)
159
+ #loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
160
+ #loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
161
+ #loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
162
+ #loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
163
+ #loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
164
+ #loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
165
+ #loc56 = loc("xoffset"(#loc1))
166
+ #loc57 = loc("xoffset"(#loc2))
167
+ #loc58 = loc("xindex"(#loc3))
168
+ #loc59 = loc("xindex"(#loc4))
169
+ #loc60 = loc("xindex"(#loc5))
170
+ #loc61 = loc("xmask"(#loc6))
171
+ #loc62 = loc("r0_base"(#loc7))
172
+ #loc63 = loc("r0_base"(#loc8))
173
+ #loc64 = loc("x0"(#loc9))
174
+ #loc65 = loc("x1"(#loc10))
175
+ #loc66 = loc("_tmp3"(#loc11))
176
+ #loc67 = loc("_tmp3"(#loc12))
177
+ #loc68 = loc("r0_index"(#loc13))
178
+ #loc69 = loc("r0_mask"(#loc14))
179
+ #loc70 = loc("tmp0"(#loc15))
180
+ #loc71 = loc("tmp0"(#loc16))
181
+ #loc72 = loc("tmp0"(#loc17))
182
+ #loc73 = loc("tmp0"(#loc18))
183
+ #loc74 = loc("tmp0"(#loc19))
184
+ #loc75 = loc("tmp0"(#loc20))
185
+ #loc76 = loc("tmp0"(#loc21))
186
+ #loc77 = loc("tmp0"(#loc22))
187
+ #loc78 = loc("tmp1"(#loc23))
188
+ #loc79 = loc("tmp4"(#loc24))
189
+ #loc80 = loc("_tmp3"(#loc25))
190
+ #loc81 = loc("_tmp3"(#loc26))
191
+ #loc82 = loc("tmp3"(#loc28))
192
+ #loc83 = loc("tmp3"(#loc29))
193
+ #loc84 = loc("tmp5"(#loc30))
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
2
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":18:0)
3
+ #loc1 = loc(unknown)
4
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:25)
5
+ #loc40 = loc("in_ptr0"(#loc))
6
+ #loc41 = loc("out_ptr1"(#loc))
7
+ #loc42 = loc("ks0"(#loc))
8
+ #loc43 = loc("ks1"(#loc))
9
+ #loc44 = loc("xnumel"(#loc))
10
+ #loc45 = loc("r0_numel"(#loc))
11
+ #loc68 = loc("tmp3"(#loc26))
12
+ #loc73 = loc(callsite(#loc1 at #loc68))
13
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
14
+ tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
15
+ %cst = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1)
16
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
17
+ %c16_i32 = arith.constant 16 : i32 loc(#loc1)
18
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
19
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
20
+ %cst_0 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1)
21
+ %xoffset = tt.get_program_id x : i32 loc(#loc46)
22
+ %xoffset_1 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc47)
23
+ %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48)
24
+ %xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc48)
25
+ %xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<32x1xi32, #blocked> loc(#loc49)
26
+ %xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<32x1xi32, #blocked> loc(#loc49)
27
+ %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32, #blocked> loc(#loc50)
28
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<32x1xi32, #blocked> loc(#loc50)
29
+ %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc51)
30
+ %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc51)
31
+ %x0 = arith.extsi %xindex_4 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc52)
32
+ %x0_7 = tt.splat %ks0 : i64 -> tensor<32x1xi64, #blocked> loc(#loc52)
33
+ %x0_8 = arith.remsi %x0, %x0_7 : tensor<32x1xi64, #blocked> loc(#loc52)
34
+ %x1 = arith.divsi %x0, %x0_7 : tensor<32x1xi64, #blocked> loc(#loc53)
35
+ %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc54)
36
+ %tmp0 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc55)
37
+ %tmp0_9 = tt.broadcast %x0_8 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc56)
38
+ %tmp0_10 = arith.muli %ks0, %ks1 : i64 loc(#loc57)
39
+ %tmp0_11 = tt.splat %tmp0_10 : i64 -> tensor<32x1xi64, #blocked> loc(#loc58)
40
+ %tmp0_12 = arith.muli %tmp0_11, %x1 : tensor<32x1xi64, #blocked> loc(#loc58)
41
+ %tmp0_13 = tt.broadcast %tmp0_12 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc59)
42
+ %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc60)
43
+ %tmp0_15 = tt.broadcast %xmask_5 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc61)
44
+ %_tmp3 = scf.for %_tmp3_17 = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_18 = %cst) -> (tensor<32x16xi64, #blocked>) : i32 {
45
+ %r0_index = tt.splat %_tmp3_17 : i32 -> tensor<1x16xi32, #blocked> loc(#loc63)
46
+ %r0_index_19 = arith.addi %r0_index, %r0_base_6 : tensor<1x16xi32, #blocked> loc(#loc63)
47
+ %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc54)
48
+ %tmp0_21 = arith.extsi %r0_index_19 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc55)
49
+ %tmp0_22 = arith.muli %tmp0, %tmp0_21 : tensor<1x16xi64, #blocked> loc(#loc55)
50
+ %tmp0_23 = tt.broadcast %tmp0_22 : tensor<1x16xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc56)
51
+ %tmp0_24 = arith.addi %tmp0_9, %tmp0_23 : tensor<32x16xi64, #blocked> loc(#loc56)
52
+ %tmp0_25 = arith.addi %tmp0_24, %tmp0_13 : tensor<32x16xi64, #blocked> loc(#loc59)
53
+ %tmp0_26 = tt.addptr %tmp0_14, %tmp0_25 : tensor<32x16x!tt.ptr<i32>, #blocked>, tensor<32x16xi64, #blocked> loc(#loc60)
54
+ %tmp0_27 = tt.broadcast %r0_mask_20 : tensor<1x16xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc61)
55
+ %tmp0_28 = arith.andi %tmp0_27, %tmp0_15 : tensor<32x16xi1, #blocked> loc(#loc61)
56
+ %tmp0_29 = tt.load %tmp0_26, %tmp0_28, %cst_0 evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc64)
57
+ %tmp1 = arith.extsi %tmp0_29 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc65)
58
+ %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<32x16xi64, #blocked> loc(#loc66)
59
+ %_tmp3_30 = arith.select %tmp0_28, %tmp4, %_tmp3_18 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc67)
60
+ scf.yield %_tmp3_30 : tensor<32x16xi64, #blocked> loc(#loc24)
61
+ } loc(#loc62)
62
+ %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({
63
+ ^bb0(%tmp3_17: i64 loc(callsite(#loc1 at #loc68)), %tmp3_18: i64 loc(callsite(#loc1 at #loc68))):
64
+ %tmp3_19 = arith.addi %tmp3_17, %tmp3_18 : i64 loc(#loc74)
65
+ tt.reduce.return %tmp3_19 : i64 loc(#loc72)
66
+ }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72)
67
+ %tmp3_16 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc69)
68
+ %tmp5 = arith.trunci %tmp3_16 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc70)
69
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc30)
70
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc31)
71
+ %2 = arith.extui %1 : i1 to i64 loc(#loc32)
72
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc32)
73
+ %4 = arith.extui %0 : i1 to i64 loc(#loc71)
74
+ %5 = arith.addi %4, %3 : i64 loc(#loc33)
75
+ %6 = tt.splat %5 : i64 -> tensor<32x1xi64, #blocked> loc(#loc35)
76
+ %7 = arith.muli %x1, %6 : tensor<32x1xi64, #blocked> loc(#loc35)
77
+ %8 = arith.addi %x0_8, %7 : tensor<32x1xi64, #blocked> loc(#loc36)
78
+ %9 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc37)
79
+ %10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr<i32>, #blocked>, tensor<32x1xi64, #blocked> loc(#loc37)
80
+ tt.store %10, %tmp5, %xmask_5 : tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc38)
81
+ tt.return loc(#loc39)
82
+ } loc(#loc)
83
+ } loc(#loc)
84
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:28)
85
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:33)
86
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:44)
87
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:23)
88
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":23:21)
89
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:37)
90
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":26:19)
91
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":27:19)
92
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":32:29)
93
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:43)
94
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:39)
95
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:54)
96
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:58)
97
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:50)
98
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:34)
99
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:73)
100
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":30:40)
101
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":31:31)
102
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:63)
103
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":37:23)
104
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":39:23)
105
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:48)
106
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:8)
107
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
108
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
109
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:28)
110
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":42:19)
111
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:49)
112
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:75)
113
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:66)
114
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:57)
115
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:41)
116
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:34)
117
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:30)
118
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:25)
119
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:88)
120
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:4)
121
+ #loc46 = loc("xoffset"(#loc2))
122
+ #loc47 = loc("xoffset"(#loc3))
123
+ #loc48 = loc("xindex"(#loc4))
124
+ #loc49 = loc("xindex"(#loc5))
125
+ #loc50 = loc("xmask"(#loc6))
126
+ #loc51 = loc("r0_base"(#loc7))
127
+ #loc52 = loc("x0"(#loc8))
128
+ #loc53 = loc("x1"(#loc9))
129
+ #loc54 = loc("r0_mask"(#loc10))
130
+ #loc55 = loc("tmp0"(#loc11))
131
+ #loc56 = loc("tmp0"(#loc12))
132
+ #loc57 = loc("tmp0"(#loc13))
133
+ #loc58 = loc("tmp0"(#loc14))
134
+ #loc59 = loc("tmp0"(#loc15))
135
+ #loc60 = loc("tmp0"(#loc16))
136
+ #loc61 = loc("tmp0"(#loc17))
137
+ #loc62 = loc("_tmp3"(#loc18))
138
+ #loc63 = loc("r0_index"(#loc19))
139
+ #loc64 = loc("tmp0"(#loc20))
140
+ #loc65 = loc("tmp1"(#loc21))
141
+ #loc66 = loc("tmp4"(#loc22))
142
+ #loc67 = loc("_tmp3"(#loc23))
143
+ #loc69 = loc("tmp3"(#loc28))
144
+ #loc70 = loc("tmp5"(#loc29))
145
+ #loc71 = loc(fused[#loc33, #loc34])
146
+ #loc72 = loc(callsite(#loc25 at #loc68))
147
+ #loc74 = loc(callsite(#loc27 at #loc72))
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:25)
4
+ #loc43 = loc("in_ptr0"(#loc))
5
+ #loc44 = loc("out_ptr1"(#loc))
6
+ #loc45 = loc("ks0"(#loc))
7
+ #loc46 = loc("ks1"(#loc))
8
+ #loc47 = loc("xnumel"(#loc))
9
+ #loc48 = loc("r0_numel"(#loc))
10
+ #loc74 = loc("tmp3"(#loc29))
11
+ #loc79 = loc(callsite(#loc1 at #loc74))
12
+ module {
13
+ tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
14
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
15
+ %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1)
16
+ %c16_i32 = arith.constant 16 : i32 loc(#loc2)
17
+ %c0_i32 = arith.constant 0 : i32 loc(#loc2)
18
+ %_tmp3 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc49)
19
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
20
+ %xoffset = tt.get_program_id x : i32 loc(#loc50)
21
+ %xoffset_0 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc51)
22
+ %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc52)
23
+ %xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc53)
24
+ %xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<32x1xi32> loc(#loc54)
25
+ %xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<32x1xi32> loc(#loc54)
26
+ %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc55)
27
+ %xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<32x1xi32> loc(#loc55)
28
+ %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc56)
29
+ %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc57)
30
+ %x0 = arith.extsi %xindex_3 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc58)
31
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc58)
32
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc58)
33
+ %x1 = arith.divsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc59)
34
+ %_tmp3_8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_10 = %_tmp3) -> (tensor<32x16xi64>) : i32 {
35
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc61)
36
+ %r0_index_11 = arith.addi %r0_index, %r0_base_5 : tensor<1x16xi32> loc(#loc61)
37
+ %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc62)
38
+ %r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x16xi32> loc(#loc62)
39
+ %tmp0 = arith.extsi %r0_index_11 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc63)
40
+ %tmp0_13 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc63)
41
+ %tmp0_14 = arith.muli %tmp0_13, %tmp0 : tensor<1x16xi64> loc(#loc63)
42
+ %tmp0_15 = tt.broadcast %x0_7 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc64)
43
+ %tmp0_16 = tt.broadcast %tmp0_14 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc64)
44
+ %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<32x16xi64> loc(#loc64)
45
+ %tmp0_18 = arith.muli %ks0, %ks1 : i64 loc(#loc65)
46
+ %tmp0_19 = tt.splat %tmp0_18 : i64 -> tensor<32x1xi64> loc(#loc66)
47
+ %tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<32x1xi64> loc(#loc66)
48
+ %tmp0_21 = tt.broadcast %tmp0_20 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc67)
49
+ %tmp0_22 = arith.addi %tmp0_17, %tmp0_21 : tensor<32x16xi64> loc(#loc67)
50
+ %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc68)
51
+ %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi64> loc(#loc68)
52
+ %tmp0_25 = tt.broadcast %r0_mask_12 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc69)
53
+ %tmp0_26 = tt.broadcast %xmask_4 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc69)
54
+ %tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<32x16xi1> loc(#loc69)
55
+ %tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>> loc(#loc70)
56
+ %tmp1 = arith.extsi %tmp0_28 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc71)
57
+ %tmp4 = arith.addi %_tmp3_10, %tmp1 : tensor<32x16xi64> loc(#loc72)
58
+ %_tmp3_29 = arith.select %tmp0_27, %tmp4, %_tmp3_10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc73)
59
+ scf.yield %_tmp3_29 : tensor<32x16xi64> loc(#loc27)
60
+ } loc(#loc60)
61
+ %tmp3 = "tt.reduce"(%_tmp3_8) <{axis = 1 : i32}> ({
62
+ ^bb0(%tmp3_10: i64 loc(callsite(#loc1 at #loc74)), %tmp3_11: i64 loc(callsite(#loc1 at #loc74))):
63
+ %tmp3_12 = arith.addi %tmp3_10, %tmp3_11 : i64 loc(#loc80)
64
+ tt.reduce.return %tmp3_12 : i64 loc(#loc78)
65
+ }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc78)
66
+ %tmp3_9 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc75)
67
+ %tmp5 = arith.trunci %tmp3_9 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc76)
68
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc33)
69
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc34)
70
+ %2 = arith.extui %1 : i1 to i64 loc(#loc35)
71
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc35)
72
+ %4 = arith.extui %0 : i1 to i64 loc(#loc77)
73
+ %5 = arith.addi %4, %3 : i64 loc(#loc36)
74
+ %6 = tt.splat %5 : i64 -> tensor<32x1xi64> loc(#loc38)
75
+ %7 = arith.muli %x1, %6 : tensor<32x1xi64> loc(#loc38)
76
+ %8 = arith.addi %x0_7, %7 : tensor<32x1xi64> loc(#loc39)
77
+ %9 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>> loc(#loc40)
78
+ %10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr<i32>>, tensor<32x1xi64> loc(#loc40)
79
+ tt.store %10, %tmp5, %xmask_4 : tensor<32x1x!tt.ptr<i32>> loc(#loc41)
80
+ tt.return loc(#loc42)
81
+ } loc(#loc)
82
+ } loc(#loc)
83
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":30:40)
84
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":28:43)
85
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:28)
86
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:33)
87
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:36)
88
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:44)
89
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:23)
90
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":23:21)
91
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:27)
92
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:37)
93
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":26:19)
94
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":27:19)
95
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":31:31)
96
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":32:29)
97
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:43)
98
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:39)
99
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:54)
100
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:58)
101
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:50)
102
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:34)
103
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:73)
104
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:63)
105
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":37:23)
106
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":39:23)
107
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:48)
108
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:8)
109
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
110
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
111
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:28)
112
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":42:19)
113
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:49)
114
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:75)
115
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:66)
116
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:57)
117
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:41)
118
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:34)
119
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:30)
120
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:25)
121
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:88)
122
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:4)
123
+ #loc49 = loc("_tmp3"(#loc3))
124
+ #loc50 = loc("xoffset"(#loc4))
125
+ #loc51 = loc("xoffset"(#loc5))
126
+ #loc52 = loc("xindex"(#loc6))
127
+ #loc53 = loc("xindex"(#loc7))
128
+ #loc54 = loc("xindex"(#loc8))
129
+ #loc55 = loc("xmask"(#loc9))
130
+ #loc56 = loc("r0_base"(#loc10))
131
+ #loc57 = loc("r0_base"(#loc11))
132
+ #loc58 = loc("x0"(#loc12))
133
+ #loc59 = loc("x1"(#loc13))
134
+ #loc60 = loc("_tmp3"(#loc2))
135
+ #loc61 = loc("r0_index"(#loc14))
136
+ #loc62 = loc("r0_mask"(#loc15))
137
+ #loc63 = loc("tmp0"(#loc16))
138
+ #loc64 = loc("tmp0"(#loc17))
139
+ #loc65 = loc("tmp0"(#loc18))
140
+ #loc66 = loc("tmp0"(#loc19))
141
+ #loc67 = loc("tmp0"(#loc20))
142
+ #loc68 = loc("tmp0"(#loc21))
143
+ #loc69 = loc("tmp0"(#loc22))
144
+ #loc70 = loc("tmp0"(#loc23))
145
+ #loc71 = loc("tmp1"(#loc24))
146
+ #loc72 = loc("tmp4"(#loc25))
147
+ #loc73 = loc("_tmp3"(#loc26))
148
+ #loc75 = loc("tmp3"(#loc31))
149
+ #loc76 = loc("tmp5"(#loc32))
150
+ #loc77 = loc(fused[#loc36, #loc37])
151
+ #loc78 = loc(callsite(#loc28 at #loc74))
152
+ #loc80 = loc(callsite(#loc30 at #loc78))
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/__grp__triton_tem_fused_zeros_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source", "triton_tem_fused_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir", "triton_tem_fused_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir", "triton_tem_fused_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir", "triton_tem_fused_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx", "triton_tem_fused_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.cubin", "triton_tem_fused_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json"}}
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d27b4d62c7c8c987ce65c6725faf9fd300a6f1f86eddda35855b227ed6dece60", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_zeros_1"}
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}}
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin ADDED
Binary file (33 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "db4fe3a24d03312f3dea8e5ee7eb1a7c6d73f10bc6de254a208984a64d7fe777", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1"}
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !4 {
9
+ %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %15 = icmp slt i32 %14, %9, !dbg !8
11
+ %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %17 = and i32 %16, 511, !dbg !9
13
+ %18 = zext nneg i32 %14 to i64, !dbg !10
14
+ %.frozen = freeze i64 %3, !dbg !10
15
+ %19 = sdiv i64 %18, %.frozen, !dbg !10
16
+ %20 = srem i64 %19, %4, !dbg !11
17
+ %21 = mul i64 %19, %.frozen, !dbg !12
18
+ %.decomposed = sub i64 %18, %21, !dbg !12
19
+ %22 = sdiv i64 %18, %7, !dbg !13
20
+ %23 = shl nsw i64 %20, 7, !dbg !14
21
+ %24 = shl nuw nsw i64 %.decomposed, 7, !dbg !15
22
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16
23
+ %26 = and i32 %16, 127
24
+ %27 = zext nneg i32 %26 to i64
25
+ %28 = or disjoint i64 %24, %27
26
+ %29 = icmp slt i64 %28, %6
27
+ %30 = icmp sge i64 %28, %8
28
+ %31 = tail call i64 @llvm.smin.i64(i64 %8, i64 0)
29
+ %32 = sub nsw i64 %.decomposed, %20
30
+ %33 = shl nsw i64 %32, 7
31
+ %34 = zext nneg i32 %17 to i64, !dbg !17
32
+ %35 = zext nneg i32 %26 to i64, !dbg !17
33
+ %36 = zext nneg i32 %16 to i64, !dbg !17
34
+ %37 = insertelement <4 x i1> poison, i1 %15, i64 0, !dbg !18
35
+ %38 = shufflevector <4 x i1> %37, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !18
36
+ %39 = insertelement <4 x i1> poison, i1 %29, i64 0, !dbg !19
37
+ %40 = shufflevector <4 x i1> %39, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !19
38
+ %41 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !20
39
+ %42 = shufflevector <4 x i64> %41, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !20
40
+ %43 = insertelement <4 x i64> poison, i64 %5, i64 0, !dbg !21
41
+ %44 = shufflevector <4 x i64> %43, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !21
42
+ %45 = insertelement <4 x i64> poison, i64 %28, i64 0, !dbg !22
43
+ %46 = shufflevector <4 x i64> %45, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !22
44
+ %47 = insertelement <4 x i1> poison, i1 %30, i64 0, !dbg !23
45
+ %48 = shufflevector <4 x i1> %47, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !23
46
+ %49 = insertelement <4 x i64> poison, i64 %33, i64 0, !dbg !24
47
+ %50 = shufflevector <4 x i64> %49, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !24
48
+ %51 = insertelement <4 x i64> poison, i64 %8, i64 0, !dbg !25
49
+ %52 = shufflevector <4 x i64> %51, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !25
50
+ br label %53, !dbg !17
51
+
52
+ 53: ; preds = %13, %53
53
+ %indvars.iv = phi i64 [ 0, %13 ], [ %indvars.iv.next, %53 ]
54
+ %54 = phi <4 x i64> [ zeroinitializer, %13 ], [ %128, %53 ]
55
+ %55 = or disjoint i64 %indvars.iv, %34, !dbg !26
56
+ %56 = or disjoint i64 %indvars.iv, %36, !dbg !26
57
+ %57 = lshr i64 %55, 7, !dbg !27
58
+ %58 = lshr i64 %56, 7, !dbg !27
59
+ %59 = trunc nuw nsw i64 %58 to i32, !dbg !27
60
+ %60 = or i32 %59, 4, !dbg !27
61
+ %61 = or disjoint i64 %57, 8, !dbg !27
62
+ %62 = or i32 %59, 12, !dbg !27
63
+ %63 = zext nneg i32 %60 to i64, !dbg !20
64
+ %64 = zext nneg i32 %62 to i64, !dbg !20
65
+ %65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
66
+ %66 = sub nsw i64 %35, %57, !dbg !29
67
+ %67 = sub nsw i32 %26, %60, !dbg !29
68
+ %68 = sub nsw i64 %35, %61, !dbg !29
69
+ %69 = sub nsw i32 %26, %62, !dbg !29
70
+ %70 = sext i32 %67 to i64, !dbg !30
71
+ %71 = sext i32 %69 to i64, !dbg !30
72
+ %72 = insertelement <4 x i64> poison, i64 %57, i64 0, !dbg !20
73
+ %73 = insertelement <4 x i64> %72, i64 %63, i64 1, !dbg !20
74
+ %74 = insertelement <4 x i64> %73, i64 %61, i64 2, !dbg !20
75
+ %75 = insertelement <4 x i64> %74, i64 %64, i64 3, !dbg !20
76
+ %76 = or disjoint <4 x i64> %42, %75, !dbg !20
77
+ %77 = icmp slt <4 x i64> %76, %44, !dbg !21
78
+ %78 = and <4 x i1> %40, %77, !dbg !19
79
+ %79 = icmp sge <4 x i64> %76, %46, !dbg !22
80
+ %80 = extractelement <4 x i1> %78, i64 0, !dbg !31
81
+ %81 = and i1 %15, %80, !dbg !31
82
+ %82 = extractelement <4 x i1> %78, i64 1, !dbg !31
83
+ %83 = and i1 %15, %82, !dbg !31
84
+ %84 = extractelement <4 x i1> %78, i64 2, !dbg !31
85
+ %85 = and i1 %15, %84, !dbg !31
86
+ %86 = extractelement <4 x i1> %78, i64 3, !dbg !31
87
+ %87 = and i1 %15, %86, !dbg !31
88
+ %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %65, i1 %81) #5, !dbg !28
89
+ %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
90
+ %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %89, i1 %83) #5, !dbg !28
91
+ %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
92
+ %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %91, i1 %85) #5, !dbg !28
93
+ %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
94
+ %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %93, i1 %87) #5, !dbg !28
95
+ %95 = insertelement <4 x i64> poison, i64 %88, i64 0, !dbg !32
96
+ %96 = insertelement <4 x i64> %95, i64 %90, i64 1, !dbg !32
97
+ %97 = insertelement <4 x i64> %96, i64 %92, i64 2, !dbg !32
98
+ %98 = insertelement <4 x i64> %97, i64 %94, i64 3, !dbg !32
99
+ %99 = icmp slt <4 x i64> %46, %98, !dbg !32
100
+ %100 = icmp slt <4 x i64> %76, %98, !dbg !33
101
+ %101 = and <4 x i1> %99, %100, !dbg !34
102
+ %102 = and <4 x i1> %79, %101, !dbg !35
103
+ %103 = srem i64 %28, %8, !dbg !36
104
+ %.not = icmp eq i64 %103, 0, !dbg !37
105
+ %104 = select i1 %.not, i64 0, i64 %31, !dbg !38
106
+ %105 = add nsw i64 %104, %103, !dbg !38
107
+ %106 = insertelement <4 x i64> poison, i64 %105, i64 0, !dbg !39
108
+ %107 = shufflevector <4 x i64> %106, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !39
109
+ %108 = icmp slt <4 x i64> %107, %98, !dbg !39
110
+ %109 = insertelement <4 x i64> poison, i64 %66, i64 0, !dbg !24
111
+ %110 = insertelement <4 x i64> %109, i64 %70, i64 1, !dbg !24
112
+ %111 = insertelement <4 x i64> %110, i64 %68, i64 2, !dbg !24
113
+ %112 = insertelement <4 x i64> %111, i64 %71, i64 3, !dbg !24
114
+ %113 = add nsw <4 x i64> %50, %112, !dbg !24
115
+ %114 = srem <4 x i64> %113, %52, !dbg !25
116
+ %115 = icmp ne <4 x i64> %114, zeroinitializer, !dbg !40
117
+ %116 = xor <4 x i64> %114, %52, !dbg !41
118
+ %117 = icmp slt <4 x i64> %116, zeroinitializer, !dbg !41
119
+ %118 = and <4 x i1> %115, %117, !dbg !42
120
+ %119 = select <4 x i1> %118, <4 x i64> %52, <4 x i64> zeroinitializer, !dbg !43
121
+ %120 = sub <4 x i64> zeroinitializer, %119, !dbg !44
122
+ %121 = icmp eq <4 x i64> %114, %120, !dbg !44
123
+ %122 = and <4 x i1> %108, %121, !dbg !23
124
+ %123 = and <4 x i1> %48, %122, !dbg !23
125
+ %124 = or <4 x i1> %102, %123, !dbg !45
126
+ %125 = select <4 x i1> %38, <4 x i1> %78, <4 x i1> zeroinitializer, !dbg !18
127
+ %126 = select <4 x i1> %125, <4 x i1> %124, <4 x i1> zeroinitializer, !dbg !18
128
+ %127 = zext <4 x i1> %126 to <4 x i64>, !dbg !18
129
+ %128 = add <4 x i64> %54, %127, !dbg !18
130
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !17
131
+ %129 = icmp samesign ult i64 %indvars.iv, 14336, !dbg !17
132
+ br i1 %129, label %53, label %130, !dbg !17
133
+
134
+ 130: ; preds = %53
135
+ %131 = and i32 %16, 31, !dbg !9
136
+ %132 = lshr i32 %16, 5, !dbg !9
137
+ %133 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %128), !dbg !46
138
+ %extelt.offset = lshr i64 %133, 32, !dbg !50
139
+ %134 = trunc nuw i64 %extelt.offset to i32, !dbg !50
140
+ %135 = trunc i64 %133 to i32, !dbg !50
141
+ %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !50
142
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 16, i32 31), !dbg !50
143
+ %138 = insertelement <2 x i32> poison, i32 %136, i64 0, !dbg !50
144
+ %139 = insertelement <2 x i32> %138, i32 %137, i64 1, !dbg !50
145
+ %140 = bitcast <2 x i32> %139 to i64, !dbg !50
146
+ %141 = add i64 %133, %140, !dbg !46
147
+ %extelt.offset1 = lshr i64 %141, 32, !dbg !50
148
+ %142 = trunc nuw i64 %extelt.offset1 to i32, !dbg !50
149
+ %143 = trunc i64 %141 to i32, !dbg !50
150
+ %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 8, i32 31), !dbg !50
151
+ %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !50
152
+ %146 = insertelement <2 x i32> poison, i32 %144, i64 0, !dbg !50
153
+ %147 = insertelement <2 x i32> %146, i32 %145, i64 1, !dbg !50
154
+ %148 = bitcast <2 x i32> %147 to i64, !dbg !50
155
+ %149 = add i64 %141, %148, !dbg !46
156
+ %extelt.offset2 = lshr i64 %149, 32, !dbg !50
157
+ %150 = trunc nuw i64 %extelt.offset2 to i32, !dbg !50
158
+ %151 = trunc i64 %149 to i32, !dbg !50
159
+ %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 4, i32 31), !dbg !50
160
+ %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 4, i32 31), !dbg !50
161
+ %154 = insertelement <2 x i32> poison, i32 %152, i64 0, !dbg !50
162
+ %155 = insertelement <2 x i32> %154, i32 %153, i64 1, !dbg !50
163
+ %156 = bitcast <2 x i32> %155 to i64, !dbg !50
164
+ %157 = add i64 %149, %156, !dbg !46
165
+ %extelt.offset3 = lshr i64 %157, 32, !dbg !50
166
+ %158 = trunc nuw i64 %extelt.offset3 to i32, !dbg !50
167
+ %159 = trunc i64 %157 to i32, !dbg !50
168
+ %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 2, i32 31), !dbg !50
169
+ %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 2, i32 31), !dbg !50
170
+ %162 = insertelement <2 x i32> poison, i32 %160, i64 0, !dbg !50
171
+ %163 = insertelement <2 x i32> %162, i32 %161, i64 1, !dbg !50
172
+ %164 = bitcast <2 x i32> %163 to i64, !dbg !50
173
+ %165 = add i64 %157, %164, !dbg !46
174
+ %extelt.offset4 = lshr i64 %165, 32, !dbg !50
175
+ %166 = trunc nuw i64 %extelt.offset4 to i32, !dbg !50
176
+ %167 = trunc i64 %165 to i32, !dbg !50
177
+ %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 1, i32 31), !dbg !50
178
+ %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 1, i32 31), !dbg !50
179
+ %170 = insertelement <2 x i32> poison, i32 %168, i64 0, !dbg !50
180
+ %171 = insertelement <2 x i32> %170, i32 %169, i64 1, !dbg !50
181
+ %172 = bitcast <2 x i32> %171 to i64, !dbg !50
182
+ %173 = add i64 %165, %172, !dbg !46
183
+ %174 = and i32 %132, 15, !dbg !50
184
+ %175 = icmp eq i32 %131, 0, !dbg !50
185
+ %176 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %174, !dbg !50
186
+ %177 = insertelement <1 x i64> poison, i64 %173, i64 0, !dbg !50
187
+ tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %176, <1 x i64> %177, i1 %175) #5, !dbg !50
188
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50
189
+ %178 = icmp samesign ult i32 %16, 16, !dbg !50
190
+ %179 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %16, !dbg !50
191
+ %180 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %179, i1 %178) #5, !dbg !50
192
+ %extelt.offset5 = lshr i64 %180, 32, !dbg !50
193
+ %181 = trunc nuw i64 %extelt.offset5 to i32, !dbg !50
194
+ %182 = trunc i64 %180 to i32, !dbg !50
195
+ %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 8, i32 31), !dbg !50
196
+ %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 8, i32 31), !dbg !50
197
+ %185 = insertelement <2 x i32> poison, i32 %183, i64 0, !dbg !50
198
+ %186 = insertelement <2 x i32> %185, i32 %184, i64 1, !dbg !50
199
+ %187 = bitcast <2 x i32> %186 to i64, !dbg !50
200
+ %188 = add i64 %180, %187, !dbg !46
201
+ %extelt.offset6 = lshr i64 %188, 32, !dbg !50
202
+ %189 = trunc nuw i64 %extelt.offset6 to i32, !dbg !50
203
+ %190 = trunc i64 %188 to i32, !dbg !50
204
+ %191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 4, i32 31), !dbg !50
205
+ %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 4, i32 31), !dbg !50
206
+ %193 = insertelement <2 x i32> poison, i32 %191, i64 0, !dbg !50
207
+ %194 = insertelement <2 x i32> %193, i32 %192, i64 1, !dbg !50
208
+ %195 = bitcast <2 x i32> %194 to i64, !dbg !50
209
+ %196 = add i64 %188, %195, !dbg !46
210
+ %extelt.offset7 = lshr i64 %196, 32, !dbg !50
211
+ %197 = trunc nuw i64 %extelt.offset7 to i32, !dbg !50
212
+ %198 = trunc i64 %196 to i32, !dbg !50
213
+ %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !50
214
+ %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 2, i32 31), !dbg !50
215
+ %201 = insertelement <2 x i32> poison, i32 %199, i64 0, !dbg !50
216
+ %202 = insertelement <2 x i32> %201, i32 %200, i64 1, !dbg !50
217
+ %203 = bitcast <2 x i32> %202 to i64, !dbg !50
218
+ %204 = add i64 %196, %203, !dbg !46
219
+ %extelt.offset8 = lshr i64 %204, 32, !dbg !50
220
+ %205 = trunc nuw i64 %extelt.offset8 to i32, !dbg !50
221
+ %206 = trunc i64 %204 to i32, !dbg !50
222
+ %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 1, i32 31), !dbg !50
223
+ %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !50
224
+ %209 = insertelement <2 x i32> poison, i32 %207, i64 0, !dbg !50
225
+ %210 = insertelement <2 x i32> %209, i32 %208, i64 1, !dbg !50
226
+ %211 = bitcast <2 x i32> %210 to i64, !dbg !50
227
+ %212 = add i64 %204, %211, !dbg !46
228
+ %213 = icmp eq i32 %16, 0, !dbg !50
229
+ %214 = insertelement <1 x i64> poison, i64 %212, i64 0, !dbg !50
230
+ tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %179, <1 x i64> %214, i1 %213) #5, !dbg !50
231
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50
232
+ %215 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !50
233
+ %216 = add i64 %215, -1, !dbg !51
234
+ %217 = icmp ult i64 %216, 16383, !dbg !51
235
+ %218 = zext i1 %217 to i32, !dbg !52
236
+ %219 = icmp eq i64 %215, 16384, !dbg !53
237
+ %220 = zext i1 %219 to i32, !dbg !52
238
+ %221 = getelementptr i32, ptr addrspace(1) %1, i64 %18, !dbg !54
239
+ %222 = icmp eq i32 %17, 0, !dbg !55
240
+ %223 = and i1 %222, %15, !dbg !55
241
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %218, ptr addrspace(1) %221, i1 %223) #5, !dbg !55
242
+ %224 = getelementptr i32, ptr addrspace(1) %2, i64 %18, !dbg !56
243
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %220, ptr addrspace(1) %224, i1 %223) #5, !dbg !57
244
+ ret void, !dbg !58
245
+ }
246
+
247
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
248
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
249
+
250
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
251
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
252
+
253
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
254
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
255
+
256
+ ; Function Attrs: convergent nocallback nounwind
257
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
258
+
259
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
260
+ declare i64 @llvm.smin.i64(i64, i64) #4
261
+
262
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
263
+ declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4
264
+
265
+ attributes #0 = { nounwind "nvvm.reqntid"="512" }
266
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
267
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
268
+ attributes #3 = { convergent nocallback nounwind }
269
+ attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
270
+ attributes #5 = { nounwind }
271
+
272
+ !llvm.dbg.cu = !{!0}
273
+ !llvm.module.flags = !{!2, !3}
274
+
275
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
276
+ !1 = !DIFile(filename: "cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av")
277
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
278
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
279
+ !4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
280
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
281
+ !6 = !{}
282
+ !7 = !DILocation(line: 22, column: 28, scope: !4)
283
+ !8 = !DILocation(line: 24, column: 21, scope: !4)
284
+ !9 = !DILocation(line: 25, column: 37, scope: !4)
285
+ !10 = !DILocation(line: 27, column: 21, scope: !4)
286
+ !11 = !DILocation(line: 27, column: 28, scope: !4)
287
+ !12 = !DILocation(line: 28, column: 19, scope: !4)
288
+ !13 = !DILocation(line: 29, column: 19, scope: !4)
289
+ !14 = !DILocation(line: 39, column: 26, scope: !4)
290
+ !15 = !DILocation(line: 42, column: 26, scope: !4)
291
+ !16 = !DILocation(line: 49, column: 35, scope: !4)
292
+ !17 = !DILocation(line: 32, column: 40, scope: !4)
293
+ !18 = !DILocation(line: 86, column: 50, scope: !4)
294
+ !19 = !DILocation(line: 45, column: 22, scope: !4)
295
+ !20 = !DILocation(line: 39, column: 22, scope: !4)
296
+ !21 = !DILocation(line: 41, column: 22, scope: !4)
297
+ !22 = !DILocation(line: 48, column: 23, scope: !4)
298
+ !23 = !DILocation(line: 79, column: 24, scope: !4)
299
+ !24 = !DILocation(line: 69, column: 51, scope: !4)
300
+ !25 = !DILocation(line: 70, column: 25, scope: !4)
301
+ !26 = !DILocation(line: 33, column: 31, scope: !4)
302
+ !27 = !DILocation(line: 37, column: 27, scope: !4)
303
+ !28 = !DILocation(line: 49, column: 77, scope: !4)
304
+ !29 = !DILocation(line: 69, column: 24, scope: !4)
305
+ !30 = !DILocation(line: 69, column: 38, scope: !4)
306
+ !31 = !DILocation(line: 49, column: 94, scope: !4)
307
+ !32 = !DILocation(line: 50, column: 23, scope: !4)
308
+ !33 = !DILocation(line: 51, column: 23, scope: !4)
309
+ !34 = !DILocation(line: 52, column: 24, scope: !4)
310
+ !35 = !DILocation(line: 53, column: 23, scope: !4)
311
+ !36 = !DILocation(line: 58, column: 24, scope: !4)
312
+ !37 = !DILocation(line: 60, column: 25, scope: !4)
313
+ !38 = !DILocation(line: 66, column: 39, scope: !4)
314
+ !39 = !DILocation(line: 67, column: 24, scope: !4)
315
+ !40 = !DILocation(line: 71, column: 25, scope: !4)
316
+ !41 = !DILocation(line: 73, column: 25, scope: !4)
317
+ !42 = !DILocation(line: 74, column: 24, scope: !4)
318
+ !43 = !DILocation(line: 76, column: 39, scope: !4)
319
+ !44 = !DILocation(line: 78, column: 25, scope: !4)
320
+ !45 = !DILocation(line: 80, column: 24, scope: !4)
321
+ !46 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !49)
322
+ !47 = distinct !DILexicalBlockFile(scope: !4, file: !48, discriminator: 0)
323
+ !48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
324
+ !49 = !DILocation(line: 87, column: 27, scope: !4)
325
+ !50 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !49)
326
+ !51 = !DILocation(line: 92, column: 20, scope: !4)
327
+ !52 = !DILocation(line: 0, scope: !4)
328
+ !53 = !DILocation(line: 95, column: 21, scope: !4)
329
+ !54 = !DILocation(line: 98, column: 25, scope: !4)
330
+ !55 = !DILocation(line: 98, column: 37, scope: !4)
331
+ !56 = !DILocation(line: 99, column: 25, scope: !4)
332
+ !57 = !DILocation(line: 99, column: 37, scope: !4)
333
+ !58 = !DILocation(line: 99, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx ADDED
@@ -0,0 +1,809 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1
12
+ .visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(
13
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1,
15
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2,
16
+ .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3,
17
+ .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4,
18
+ .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5,
19
+ .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6,
20
+ .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7,
21
+ .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8,
22
+ .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9,
23
+ .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_10,
24
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_11,
25
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_12
26
+ )
27
+ .reqntid 512
28
+ {
29
+ .reg .pred %p<89>;
30
+ .reg .b32 %r<77>;
31
+ .reg .b64 %rd<216>;
32
+ .loc 1 18 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:18:0
33
+ $L__func_begin0:
34
+ .loc 1 18 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:18:0
35
+
36
+ // %bb.0:
37
+ ld.param.b64 %rd65, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4];
38
+ $L__tmp0:
39
+ .loc 1 22 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:22:28
40
+ mov.u32 %r8, %ctaid.x;
41
+ ld.param.b64 %rd70, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3];
42
+ .loc 1 27 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:27:21
43
+ cvt.u64.u32 %rd1, %r8;
44
+ and.b64 %rd71, %rd70, -4294967296;
45
+ setp.ne.b64 %p21, %rd71, 0;
46
+ cvt.u32.u64 %r75, %rd1;
47
+ @%p21 bra $L__BB0_2;
48
+ bra.uni $L__BB0_1;
49
+ $L__BB0_2:
50
+ div.s64 %rd205, %rd1, %rd70;
51
+ bra.uni $L__BB0_3;
52
+ $L__BB0_1:
53
+ cvt.u32.u64 %r9, %rd70;
54
+ div.u32 %r11, %r75, %r9;
55
+ cvt.u64.u32 %rd205, %r11;
56
+ $L__BB0_3:
57
+ .loc 1 0 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:21
58
+ mov.u32 %r1, %tid.x;
59
+ ld.param.b64 %rd68, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7];
60
+ .loc 1 27 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:27:28
61
+ or.b64 %rd72, %rd205, %rd65;
62
+ and.b64 %rd73, %rd72, -4294967296;
63
+ setp.ne.b64 %p22, %rd73, 0;
64
+ @%p22 bra $L__BB0_5;
65
+ bra.uni $L__BB0_4;
66
+ $L__BB0_5:
67
+ rem.s64 %rd206, %rd205, %rd65;
68
+ bra.uni $L__BB0_6;
69
+ $L__BB0_4:
70
+ cvt.u32.u64 %r12, %rd65;
71
+ cvt.u32.u64 %r13, %rd205;
72
+ rem.u32 %r14, %r13, %r12;
73
+ cvt.u64.u32 %rd206, %r14;
74
+ $L__BB0_6:
75
+ .loc 1 0 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:28
76
+ ld.param.b32 %r7, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9];
77
+ ld.param.b64 %rd69, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8];
78
+ ld.param.b64 %rd67, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6];
79
+ ld.param.b64 %rd62, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0];
80
+ and.b32 %r2, %r1, 511;
81
+ .loc 1 28 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:28:19
82
+ mul.lo.s64 %rd74, %rd205, %rd70;
83
+ sub.s64 %rd9, %rd1, %rd74;
84
+ .loc 1 29 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:29:19
85
+ and.b64 %rd75, %rd68, -4294967296;
86
+ setp.ne.b64 %p23, %rd75, 0;
87
+ @%p23 bra $L__BB0_8;
88
+ bra.uni $L__BB0_7;
89
+ $L__BB0_8:
90
+ div.s64 %rd207, %rd1, %rd68;
91
+ bra.uni $L__BB0_9;
92
+ $L__BB0_7:
93
+ cvt.u32.u64 %r15, %rd68;
94
+ div.u32 %r17, %r75, %r15;
95
+ cvt.u64.u32 %rd207, %r17;
96
+ $L__BB0_9:
97
+ .loc 1 0 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:19
98
+ ld.param.b64 %rd66, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5];
99
+ ld.param.b64 %rd64, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2];
100
+ ld.param.b64 %rd63, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1];
101
+ .loc 1 24 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:24:21
102
+ setp.lt.s32 %p1, %r75, %r7;
103
+ .loc 1 39 26 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:26
104
+ shl.b64 %rd16, %rd206, 7;
105
+ .loc 1 42 26 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:42:26
106
+ shl.b64 %rd81, %rd9, 7;
107
+ .loc 1 49 35 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:35
108
+ shl.b64 %rd82, %rd207, 3;
109
+ add.s64 %rd90, %rd62, %rd82;
110
+ and.b32 %r3, %r1, 127;
111
+ cvt.u64.u32 %rd83, %r3;
112
+ or.b64 %rd24, %rd81, %rd83;
113
+ setp.lt.s64 %p5, %rd24, %rd67;
114
+ setp.ge.s64 %p9, %rd24, %rd69;
115
+ min.s64 %rd15, %rd69, 0;
116
+ sub.s64 %rd84, %rd9, %rd206;
117
+ shl.b64 %rd28, %rd84, 7;
118
+ .loc 1 32 40 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:32:40
119
+ cvt.u64.u32 %rd85, %r2;
120
+ cvt.u64.u32 %rd86, %r1;
121
+ shr.u64 %rd87, %rd86, 7;
122
+ cvt.u32.u64 %r76, %rd87;
123
+ shr.u64 %rd209, %rd85, 7;
124
+ sub.s64 %rd208, %rd83, %rd209;
125
+ mov.b64 %rd211, 0;
126
+ mov.b64 %rd210, -2048;
127
+ mov.b64 %rd212, %rd211;
128
+ mov.b64 %rd213, %rd211;
129
+ mov.b64 %rd214, %rd211;
130
+ bra.uni $L__BB0_10;
131
+ $L__BB0_12: // in Loop: Header=BB0_10 Depth=1
132
+ .loc 1 58 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:58:24
133
+ rem.s64 %rd215, %rd24, %rd69;
134
+ $L__BB0_13: // in Loop: Header=BB0_10 Depth=1
135
+ .loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0
136
+ sub.s32 %r22, %r3, %r20;
137
+ add.s64 %rd45, %rd208, -8;
138
+ sub.s32 %r23, %r3, %r21;
139
+ cvt.s64.s32 %rd46, %r22;
140
+ cvt.s64.s32 %rd47, %r23;
141
+ .loc 1 60 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:60:25
142
+ setp.eq.b64 %p42, %rd215, 0;
143
+ .loc 1 66 39 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:66:39
144
+ selp.b64 %rd117, 0, %rd15, %p42;
145
+ add.s64 %rd118, %rd117, %rd215;
146
+ .loc 1 67 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:67:24
147
+ setp.lt.s64 %p43, %rd118, %rd89;
148
+ setp.lt.s64 %p44, %rd118, %rd93;
149
+ setp.lt.s64 %p45, %rd118, %rd97;
150
+ setp.lt.s64 %p46, %rd118, %rd101;
151
+ .loc 1 69 51 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:69:51
152
+ add.s64 %rd119, %rd28, %rd47;
153
+ add.s64 %rd120, %rd28, %rd45;
154
+ add.s64 %rd121, %rd28, %rd46;
155
+ add.s64 %rd122, %rd28, %rd208;
156
+ .loc 1 70 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:70:25
157
+ rem.s64 %rd123, %rd122, %rd69;
158
+ rem.s64 %rd124, %rd121, %rd69;
159
+ rem.s64 %rd125, %rd120, %rd69;
160
+ rem.s64 %rd126, %rd119, %rd69;
161
+ .loc 1 71 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:71:25
162
+ setp.ne.b64 %p47, %rd126, 0;
163
+ setp.ne.b64 %p48, %rd125, 0;
164
+ setp.ne.b64 %p49, %rd124, 0;
165
+ setp.ne.b64 %p50, %rd123, 0;
166
+ .loc 1 73 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:73:25
167
+ xor.b64 %rd127, %rd126, %rd69;
168
+ xor.b64 %rd128, %rd125, %rd69;
169
+ xor.b64 %rd129, %rd124, %rd69;
170
+ xor.b64 %rd130, %rd123, %rd69;
171
+ .loc 1 76 39 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:76:39
172
+ shr.s64 %rd131, %rd130, 63;
173
+ and.b64 %rd132, %rd131, %rd69;
174
+ selp.b64 %rd133, %rd132, 0, %p50;
175
+ shr.s64 %rd134, %rd129, 63;
176
+ and.b64 %rd135, %rd134, %rd69;
177
+ selp.b64 %rd136, %rd135, 0, %p49;
178
+ shr.s64 %rd137, %rd128, 63;
179
+ and.b64 %rd138, %rd137, %rd69;
180
+ selp.b64 %rd139, %rd138, 0, %p48;
181
+ shr.s64 %rd140, %rd127, 63;
182
+ and.b64 %rd141, %rd140, %rd69;
183
+ selp.b64 %rd142, %rd141, 0, %p47;
184
+ .loc 1 78 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:78:25
185
+ neg.s64 %rd143, %rd142;
186
+ neg.s64 %rd144, %rd139;
187
+ neg.s64 %rd145, %rd136;
188
+ neg.s64 %rd146, %rd133;
189
+ setp.eq.b64 %p51, %rd123, %rd146;
190
+ setp.eq.b64 %p52, %rd124, %rd145;
191
+ setp.eq.b64 %p53, %rd125, %rd144;
192
+ setp.eq.b64 %p54, %rd126, %rd143;
193
+ .loc 1 79 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:79:24
194
+ and.pred %p55, %p46, %p54;
195
+ and.pred %p57, %p45, %p53;
196
+ and.pred %p59, %p44, %p52;
197
+ and.pred %p61, %p43, %p51;
198
+ and.pred %p63, %p9, %p61;
199
+ and.pred %p64, %p9, %p59;
200
+ and.pred %p65, %p9, %p57;
201
+ and.pred %p66, %p9, %p55;
202
+ .loc 1 80 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:80:24
203
+ or.pred %p67, %p20, %p66;
204
+ or.pred %p68, %p19, %p65;
205
+ or.pred %p69, %p18, %p64;
206
+ or.pred %p70, %p17, %p63;
207
+ .loc 1 86 50 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:86:50
208
+ and.pred %p75, %p24, %p70;
209
+ and.pred %p76, %p25, %p69;
210
+ and.pred %p77, %p26, %p68;
211
+ and.pred %p78, %p27, %p67;
212
+ selp.b64 %rd147, 1, 0, %p78;
213
+ selp.b64 %rd148, 1, 0, %p77;
214
+ selp.b64 %rd149, 1, 0, %p76;
215
+ selp.b64 %rd150, 1, 0, %p75;
216
+ add.s64 %rd211, %rd211, %rd150;
217
+ add.s64 %rd212, %rd212, %rd149;
218
+ add.s64 %rd213, %rd213, %rd148;
219
+ add.s64 %rd214, %rd214, %rd147;
220
+ .loc 1 32 40 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:32:40
221
+ add.s64 %rd210, %rd210, 2048;
222
+ add.s32 %r76, %r76, 16;
223
+ add.s64 %rd209, %rd209, 16;
224
+ add.s64 %rd208, %rd208, -16;
225
+ setp.lt.u64 %p79, %rd210, 14336;
226
+ @%p79 bra $L__BB0_10;
227
+ bra.uni $L__BB0_14;
228
+ $L__BB0_10: // =>This Inner Loop Header: Depth=1
229
+ .loc 1 37 27 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:37:27
230
+ or.b32 %r20, %r76, 4;
231
+ add.s64 %rd104, %rd209, %rd16;
232
+ or.b32 %r21, %r76, 12;
233
+ .loc 1 39 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:22
234
+ cvt.u64.u32 %rd105, %r20;
235
+ cvt.u64.u32 %rd106, %r21;
236
+ .loc 1 49 77 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:77
237
+ // begin inline asm
238
+ mov.u64 %rd88, 0x0;
239
+ createpolicy.fractional.L2::evict_last.b64 %rd88, 1.0;
240
+ // end inline asm
241
+ .loc 1 39 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:22
242
+ or.b64 %rd107, %rd16, %rd106;
243
+ add.s64 %rd108, %rd104, 8;
244
+ or.b64 %rd109, %rd16, %rd105;
245
+ or.b64 %rd110, %rd16, %rd209;
246
+ .loc 1 41 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:41:22
247
+ setp.lt.s64 %p29, %rd110, %rd66;
248
+ setp.lt.s64 %p30, %rd109, %rd66;
249
+ setp.lt.s64 %p31, %rd108, %rd66;
250
+ setp.lt.s64 %p32, %rd107, %rd66;
251
+ .loc 1 45 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:45:22
252
+ and.pred %p16, %p5, %p32;
253
+ and.pred %p15, %p5, %p31;
254
+ and.pred %p14, %p5, %p30;
255
+ and.pred %p13, %p5, %p29;
256
+ .loc 1 48 23 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:48:23
257
+ setp.ge.s64 %p33, %rd107, %rd24;
258
+ setp.ge.s64 %p34, %rd108, %rd24;
259
+ setp.ge.s64 %p35, %rd109, %rd24;
260
+ setp.ge.s64 %p36, %rd110, %rd24;
261
+ .loc 1 49 94 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:94
262
+ and.pred %p24, %p1, %p13;
263
+ and.pred %p25, %p1, %p14;
264
+ and.pred %p26, %p1, %p15;
265
+ and.pred %p27, %p1, %p16;
266
+ .loc 1 49 77 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:77
267
+ // begin inline asm
268
+ mov.u64 %rd89, 0x0;
269
+ @%p24 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd89 }, [ %rd90 + 0 ], %rd88;
270
+ // end inline asm
271
+ // begin inline asm
272
+ mov.u64 %rd92, 0x0;
273
+ createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0;
274
+ // end inline asm
275
+ // begin inline asm
276
+ mov.u64 %rd93, 0x0;
277
+ @%p25 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd93 }, [ %rd90 + 0 ], %rd92;
278
+ // end inline asm
279
+ // begin inline asm
280
+ mov.u64 %rd96, 0x0;
281
+ createpolicy.fractional.L2::evict_last.b64 %rd96, 1.0;
282
+ // end inline asm
283
+ // begin inline asm
284
+ mov.u64 %rd97, 0x0;
285
+ @%p26 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd97 }, [ %rd90 + 0 ], %rd96;
286
+ // end inline asm
287
+ // begin inline asm
288
+ mov.u64 %rd100, 0x0;
289
+ createpolicy.fractional.L2::evict_last.b64 %rd100, 1.0;
290
+ // end inline asm
291
+ // begin inline asm
292
+ mov.u64 %rd101, 0x0;
293
+ @%p27 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd101 }, [ %rd90 + 0 ], %rd100;
294
+ // end inline asm
295
+ .loc 1 52 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:52:24
296
+ max.s64 %rd111, %rd24, %rd107;
297
+ setp.lt.s64 %p37, %rd111, %rd101;
298
+ max.s64 %rd112, %rd24, %rd108;
299
+ setp.lt.s64 %p38, %rd112, %rd97;
300
+ max.s64 %rd113, %rd24, %rd109;
301
+ setp.lt.s64 %p39, %rd113, %rd93;
302
+ max.s64 %rd114, %rd24, %rd110;
303
+ setp.lt.s64 %p40, %rd114, %rd89;
304
+ .loc 1 53 23 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:53:23
305
+ and.pred %p17, %p36, %p40;
306
+ and.pred %p18, %p35, %p39;
307
+ and.pred %p19, %p34, %p38;
308
+ and.pred %p20, %p33, %p37;
309
+ .loc 1 58 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:58:24
310
+ or.b64 %rd115, %rd24, %rd69;
311
+ and.b64 %rd116, %rd115, -4294967296;
312
+ setp.ne.b64 %p41, %rd116, 0;
313
+ @%p41 bra $L__BB0_12;
314
+ // %bb.11: // in Loop: Header=BB0_10 Depth=1
315
+ cvt.u32.u64 %r24, %rd69;
316
+ cvt.u32.u64 %r25, %rd24;
317
+ rem.u32 %r26, %r25, %r24;
318
+ cvt.u64.u32 %rd215, %r26;
319
+ bra.uni $L__BB0_13;
320
+ $L__BB0_14:
321
+ .loc 1 25 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:25:37
322
+ and.b32 %r33, %r1, 31;
323
+ $L__tmp1:
324
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
325
+ add.s64 %rd156, %rd211, %rd213;
326
+ add.s64 %rd157, %rd212, %rd214;
327
+ add.s64 %rd158, %rd156, %rd157;
328
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
329
+ mov.b64 {_, %r34}, %rd158;
330
+ cvt.u32.u64 %r35, %rd158;
331
+ shfl.sync.bfly.b32 %r36, %r35, 16, 31, -1;
332
+ shfl.sync.bfly.b32 %r37, %r34, 16, 31, -1;
333
+ cvt.u64.u32 %rd159, %r36;
334
+ cvt.u64.u32 %rd160, %r37;
335
+ shl.b64 %rd161, %rd160, 32;
336
+ or.b64 %rd162, %rd159, %rd161;
337
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
338
+ add.s64 %rd163, %rd158, %rd162;
339
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
340
+ mov.b64 {_, %r38}, %rd163;
341
+ cvt.u32.u64 %r39, %rd163;
342
+ shfl.sync.bfly.b32 %r40, %r39, 8, 31, -1;
343
+ shfl.sync.bfly.b32 %r41, %r38, 8, 31, -1;
344
+ cvt.u64.u32 %rd164, %r40;
345
+ cvt.u64.u32 %rd165, %r41;
346
+ shl.b64 %rd166, %rd165, 32;
347
+ or.b64 %rd167, %rd164, %rd166;
348
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
349
+ add.s64 %rd168, %rd163, %rd167;
350
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
351
+ mov.b64 {_, %r42}, %rd168;
352
+ cvt.u32.u64 %r43, %rd168;
353
+ shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1;
354
+ shfl.sync.bfly.b32 %r45, %r42, 4, 31, -1;
355
+ cvt.u64.u32 %rd169, %r44;
356
+ cvt.u64.u32 %rd170, %r45;
357
+ shl.b64 %rd171, %rd170, 32;
358
+ or.b64 %rd172, %rd169, %rd171;
359
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
360
+ add.s64 %rd173, %rd168, %rd172;
361
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
362
+ mov.b64 {_, %r46}, %rd173;
363
+ cvt.u32.u64 %r47, %rd173;
364
+ shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1;
365
+ shfl.sync.bfly.b32 %r49, %r46, 2, 31, -1;
366
+ cvt.u64.u32 %rd174, %r48;
367
+ cvt.u64.u32 %rd175, %r49;
368
+ shl.b64 %rd176, %rd175, 32;
369
+ or.b64 %rd177, %rd174, %rd176;
370
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
371
+ add.s64 %rd178, %rd173, %rd177;
372
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
373
+ mov.b64 {_, %r50}, %rd178;
374
+ cvt.u32.u64 %r51, %rd178;
375
+ shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1;
376
+ shfl.sync.bfly.b32 %r53, %r50, 1, 31, -1;
377
+ cvt.u64.u32 %rd179, %r52;
378
+ cvt.u64.u32 %rd180, %r53;
379
+ shl.b64 %rd181, %rd180, 32;
380
+ or.b64 %rd182, %rd179, %rd181;
381
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
382
+ add.s64 %rd151, %rd178, %rd182;
383
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
384
+ setp.eq.b32 %p80, %r33, 0;
385
+ shr.u32 %r54, %r1, 2;
386
+ and.b32 %r55, %r54, 120;
387
+ mov.b32 %r56, global_smem;
388
+ add.s32 %r27, %r56, %r55;
389
+ // begin inline asm
390
+ @%p80 st.shared.b64 [ %r27 + 0 ], %rd151;
391
+ // end inline asm
392
+ bar.sync 0;
393
+ setp.lt.u32 %p81, %r1, 16;
394
+ shl.b32 %r57, %r1, 3;
395
+ add.s32 %r28, %r56, %r57;
396
+ // begin inline asm
397
+ @%p81 ld.shared.b64 %rd152, [ %r28 + 0 ];
398
+ // end inline asm
399
+ mov.b64 {_, %r58}, %rd152;
400
+ cvt.u32.u64 %r59, %rd152;
401
+ shfl.sync.bfly.b32 %r60, %r59, 8, 31, -1;
402
+ shfl.sync.bfly.b32 %r61, %r58, 8, 31, -1;
403
+ cvt.u64.u32 %rd183, %r60;
404
+ cvt.u64.u32 %rd184, %r61;
405
+ shl.b64 %rd185, %rd184, 32;
406
+ or.b64 %rd186, %rd183, %rd185;
407
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
408
+ add.s64 %rd187, %rd152, %rd186;
409
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
410
+ mov.b64 {_, %r62}, %rd187;
411
+ cvt.u32.u64 %r63, %rd187;
412
+ shfl.sync.bfly.b32 %r64, %r63, 4, 31, -1;
413
+ shfl.sync.bfly.b32 %r65, %r62, 4, 31, -1;
414
+ cvt.u64.u32 %rd188, %r64;
415
+ cvt.u64.u32 %rd189, %r65;
416
+ shl.b64 %rd190, %rd189, 32;
417
+ or.b64 %rd191, %rd188, %rd190;
418
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
419
+ add.s64 %rd192, %rd187, %rd191;
420
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
421
+ mov.b64 {_, %r66}, %rd192;
422
+ cvt.u32.u64 %r67, %rd192;
423
+ shfl.sync.bfly.b32 %r68, %r67, 2, 31, -1;
424
+ shfl.sync.bfly.b32 %r69, %r66, 2, 31, -1;
425
+ cvt.u64.u32 %rd193, %r68;
426
+ cvt.u64.u32 %rd194, %r69;
427
+ shl.b64 %rd195, %rd194, 32;
428
+ or.b64 %rd196, %rd193, %rd195;
429
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
430
+ add.s64 %rd197, %rd192, %rd196;
431
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
432
+ mov.b64 {_, %r70}, %rd197;
433
+ cvt.u32.u64 %r71, %rd197;
434
+ shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1;
435
+ shfl.sync.bfly.b32 %r73, %r70, 1, 31, -1;
436
+ cvt.u64.u32 %rd198, %r72;
437
+ cvt.u64.u32 %rd199, %r73;
438
+ shl.b64 %rd200, %rd199, 32;
439
+ or.b64 %rd201, %rd198, %rd200;
440
+ .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
441
+ add.s64 %rd153, %rd197, %rd201;
442
+ .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
443
+ setp.eq.b32 %p82, %r1, 0;
444
+ // begin inline asm
445
+ @%p82 st.shared.b64 [ %r28 + 0 ], %rd153;
446
+ // end inline asm
447
+ bar.sync 0;
448
+ ld.shared.b64 %rd202, [global_smem];
449
+ $L__tmp2:
450
+ .loc 1 92 20 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:92:20
451
+ add.s64 %rd203, %rd202, -1;
452
+ setp.lt.u64 %p86, %rd203, 16383;
453
+ .loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0
454
+ selp.b32 %r30, 1, 0, %p86;
455
+ .loc 1 95 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:95:21
456
+ setp.eq.b64 %p87, %rd202, 16384;
457
+ .loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0
458
+ selp.b32 %r31, 1, 0, %p87;
459
+ .loc 1 98 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:98:25
460
+ shl.b64 %rd204, %rd1, 2;
461
+ add.s64 %rd154, %rd63, %rd204;
462
+ .loc 1 98 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:98:37
463
+ setp.eq.b32 %p88, %r2, 0;
464
+ and.pred %p83, %p88, %p1;
465
+ // begin inline asm
466
+ @%p83 st.global.b32 [ %rd154 + 0 ], { %r30 };
467
+ // end inline asm
468
+ .loc 1 99 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:25
469
+ add.s64 %rd155, %rd64, %rd204;
470
+ .loc 1 99 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:37
471
+ // begin inline asm
472
+ @%p83 st.global.b32 [ %rd155 + 0 ], { %r31 };
473
+ // end inline asm
474
+ .loc 1 99 4 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:4
475
+ ret;
476
+ $L__tmp3:
477
+ $L__func_end0:
478
+ // -- End function
479
+ }
480
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py"
481
+ .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
482
+ .section .debug_abbrev
483
+ {
484
+ .b8 1 // Abbreviation Code
485
+ .b8 17 // DW_TAG_compile_unit
486
+ .b8 1 // DW_CHILDREN_yes
487
+ .b8 37 // DW_AT_producer
488
+ .b8 8 // DW_FORM_string
489
+ .b8 19 // DW_AT_language
490
+ .b8 5 // DW_FORM_data2
491
+ .b8 3 // DW_AT_name
492
+ .b8 8 // DW_FORM_string
493
+ .b8 16 // DW_AT_stmt_list
494
+ .b8 6 // DW_FORM_data4
495
+ .b8 27 // DW_AT_comp_dir
496
+ .b8 8 // DW_FORM_string
497
+ .b8 0 // EOM(1)
498
+ .b8 0 // EOM(2)
499
+ .b8 2 // Abbreviation Code
500
+ .b8 46 // DW_TAG_subprogram
501
+ .b8 0 // DW_CHILDREN_no
502
+ .b8 3 // DW_AT_name
503
+ .b8 8 // DW_FORM_string
504
+ .b8 32 // DW_AT_inline
505
+ .b8 11 // DW_FORM_data1
506
+ .b8 0 // EOM(1)
507
+ .b8 0 // EOM(2)
508
+ .b8 3 // Abbreviation Code
509
+ .b8 46 // DW_TAG_subprogram
510
+ .b8 1 // DW_CHILDREN_yes
511
+ .b8 17 // DW_AT_low_pc
512
+ .b8 1 // DW_FORM_addr
513
+ .b8 18 // DW_AT_high_pc
514
+ .b8 1 // DW_FORM_addr
515
+ .b8 49 // DW_AT_abstract_origin
516
+ .b8 19 // DW_FORM_ref4
517
+ .b8 0 // EOM(1)
518
+ .b8 0 // EOM(2)
519
+ .b8 4 // Abbreviation Code
520
+ .b8 29 // DW_TAG_inlined_subroutine
521
+ .b8 0 // DW_CHILDREN_no
522
+ .b8 49 // DW_AT_abstract_origin
523
+ .b8 19 // DW_FORM_ref4
524
+ .b8 17 // DW_AT_low_pc
525
+ .b8 1 // DW_FORM_addr
526
+ .b8 18 // DW_AT_high_pc
527
+ .b8 1 // DW_FORM_addr
528
+ .b8 88 // DW_AT_call_file
529
+ .b8 11 // DW_FORM_data1
530
+ .b8 89 // DW_AT_call_line
531
+ .b8 11 // DW_FORM_data1
532
+ .b8 87 // DW_AT_call_column
533
+ .b8 11 // DW_FORM_data1
534
+ .b8 0 // EOM(1)
535
+ .b8 0 // EOM(2)
536
+ .b8 0 // EOM(3)
537
+ }
538
+ .section .debug_info
539
+ {
540
+ .b32 307 // Length of Unit
541
+ .b8 2 // DWARF version number
542
+ .b8 0
543
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
544
+ .b8 8 // Address Size (in bytes)
545
+ .b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit
546
+ .b8 116 // DW_AT_producer
547
+ .b8 114
548
+ .b8 105
549
+ .b8 116
550
+ .b8 111
551
+ .b8 110
552
+ .b8 0
553
+ .b8 2 // DW_AT_language
554
+ .b8 0
555
+ .b8 99 // DW_AT_name
556
+ .b8 97
557
+ .b8 118
558
+ .b8 112
559
+ .b8 55
560
+ .b8 120
561
+ .b8 97
562
+ .b8 110
563
+ .b8 55
564
+ .b8 55
565
+ .b8 116
566
+ .b8 102
567
+ .b8 114
568
+ .b8 55
569
+ .b8 113
570
+ .b8 121
571
+ .b8 116
572
+ .b8 102
573
+ .b8 107
574
+ .b8 112
575
+ .b8 54
576
+ .b8 115
577
+ .b8 106
578
+ .b8 114
579
+ .b8 103
580
+ .b8 107
581
+ .b8 100
582
+ .b8 54
583
+ .b8 104
584
+ .b8 118
585
+ .b8 114
586
+ .b8 117
587
+ .b8 105
588
+ .b8 97
589
+ .b8 113
590
+ .b8 102
591
+ .b8 122
592
+ .b8 107
593
+ .b8 101
594
+ .b8 105
595
+ .b8 98
596
+ .b8 116
597
+ .b8 108
598
+ .b8 53
599
+ .b8 114
600
+ .b8 116
601
+ .b8 97
602
+ .b8 103
603
+ .b8 115
604
+ .b8 99
605
+ .b8 110
606
+ .b8 103
607
+ .b8 46
608
+ .b8 112
609
+ .b8 121
610
+ .b8 0
611
+ .b32 .debug_line // DW_AT_stmt_list
612
+ .b8 47 // DW_AT_comp_dir
613
+ .b8 119
614
+ .b8 111
615
+ .b8 114
616
+ .b8 107
617
+ .b8 115
618
+ .b8 112
619
+ .b8 97
620
+ .b8 99
621
+ .b8 101
622
+ .b8 47
623
+ .b8 104
624
+ .b8 97
625
+ .b8 110
626
+ .b8 114
627
+ .b8 117
628
+ .b8 105
629
+ .b8 47
630
+ .b8 83
631
+ .b8 112
632
+ .b8 101
633
+ .b8 99
634
+ .b8 70
635
+ .b8 111
636
+ .b8 114
637
+ .b8 103
638
+ .b8 101
639
+ .b8 45
640
+ .b8 101
641
+ .b8 120
642
+ .b8 116
643
+ .b8 47
644
+ .b8 99
645
+ .b8 97
646
+ .b8 99
647
+ .b8 104
648
+ .b8 101
649
+ .b8 47
650
+ .b8 99
651
+ .b8 111
652
+ .b8 109
653
+ .b8 112
654
+ .b8 105
655
+ .b8 108
656
+ .b8 101
657
+ .b8 100
658
+ .b8 95
659
+ .b8 107
660
+ .b8 101
661
+ .b8 114
662
+ .b8 110
663
+ .b8 101
664
+ .b8 108
665
+ .b8 115
666
+ .b8 47
667
+ .b8 97
668
+ .b8 118
669
+ .b8 0
670
+ .b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram
671
+ .b8 116 // DW_AT_name
672
+ .b8 114
673
+ .b8 105
674
+ .b8 116
675
+ .b8 111
676
+ .b8 110
677
+ .b8 95
678
+ .b8 114
679
+ .b8 101
680
+ .b8 100
681
+ .b8 95
682
+ .b8 102
683
+ .b8 117
684
+ .b8 115
685
+ .b8 101
686
+ .b8 100
687
+ .b8 95
688
+ .b8 95
689
+ .b8 116
690
+ .b8 111
691
+ .b8 95
692
+ .b8 99
693
+ .b8 111
694
+ .b8 112
695
+ .b8 121
696
+ .b8 95
697
+ .b8 97
698
+ .b8 114
699
+ .b8 97
700
+ .b8 110
701
+ .b8 103
702
+ .b8 101
703
+ .b8 95
704
+ .b8 98
705
+ .b8 105
706
+ .b8 116
707
+ .b8 119
708
+ .b8 105
709
+ .b8 115
710
+ .b8 101
711
+ .b8 95
712
+ .b8 97
713
+ .b8 110
714
+ .b8 100
715
+ .b8 95
716
+ .b8 98
717
+ .b8 105
718
+ .b8 116
719
+ .b8 119
720
+ .b8 105
721
+ .b8 115
722
+ .b8 101
723
+ .b8 95
724
+ .b8 111
725
+ .b8 114
726
+ .b8 95
727
+ .b8 99
728
+ .b8 111
729
+ .b8 110
730
+ .b8 115
731
+ .b8 116
732
+ .b8 97
733
+ .b8 110
734
+ .b8 116
735
+ .b8 95
736
+ .b8 112
737
+ .b8 97
738
+ .b8 100
739
+ .b8 95
740
+ .b8 110
741
+ .b8 100
742
+ .b8 95
743
+ .b8 101
744
+ .b8 113
745
+ .b8 95
746
+ .b8 103
747
+ .b8 101
748
+ .b8 95
749
+ .b8 103
750
+ .b8 116
751
+ .b8 95
752
+ .b8 105
753
+ .b8 110
754
+ .b8 100
755
+ .b8 101
756
+ .b8 120
757
+ .b8 95
758
+ .b8 108
759
+ .b8 116
760
+ .b8 95
761
+ .b8 112
762
+ .b8 101
763
+ .b8 114
764
+ .b8 109
765
+ .b8 117
766
+ .b8 116
767
+ .b8 101
768
+ .b8 95
769
+ .b8 114
770
+ .b8 101
771
+ .b8 109
772
+ .b8 97
773
+ .b8 105
774
+ .b8 110
775
+ .b8 100
776
+ .b8 101
777
+ .b8 114
778
+ .b8 95
779
+ .b8 115
780
+ .b8 117
781
+ .b8 98
782
+ .b8 95
783
+ .b8 115
784
+ .b8 117
785
+ .b8 109
786
+ .b8 95
787
+ .b8 118
788
+ .b8 105
789
+ .b8 101
790
+ .b8 119
791
+ .b8 95
792
+ .b8 49
793
+ .b8 0
794
+ .b8 1 // DW_AT_inline
795
+ .b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram
796
+ .b64 $L__func_begin0 // DW_AT_low_pc
797
+ .b64 $L__func_end0 // DW_AT_high_pc
798
+ .b32 139 // DW_AT_abstract_origin
799
+ .b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine
800
+ .b32 139 // DW_AT_abstract_origin
801
+ .b64 $L__tmp1 // DW_AT_low_pc
802
+ .b64 $L__tmp2 // DW_AT_high_pc
803
+ .b8 1 // DW_AT_call_file
804
+ .b8 87 // DW_AT_call_line
805
+ .b8 27 // DW_AT_call_column
806
+ .b8 0 // End Of Children Mark
807
+ .b8 0 // End Of Children Mark
808
+ }
809
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0)
2
+ #loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
3
+ #loc99 = loc(unknown)
4
+ #loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
5
+ #loc106 = loc("in_ptr0"(#loc))
6
+ #loc107 = loc("out_ptr1"(#loc))
7
+ #loc108 = loc("out_ptr2"(#loc))
8
+ #loc109 = loc("ks0"(#loc))
9
+ #loc110 = loc("ks1"(#loc))
10
+ #loc111 = loc("ks2"(#loc))
11
+ #loc112 = loc("ks3"(#loc))
12
+ #loc113 = loc("ks4"(#loc))
13
+ #loc114 = loc("ks5"(#loc))
14
+ #loc115 = loc("xnumel"(#loc))
15
+ #loc116 = loc("r0_numel"(#loc))
16
+ #loc207 = loc("input"(#loc97))
17
+ #loc208 = loc("a"(#loc102))
18
+ #loc209 = loc("b"(#loc102))
19
+ module {
20
+ tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
21
+ %r0_numel_0 = arith.constant 16384 : i32 loc(#loc117)
22
+ %xoffset = tt.get_program_id x : i32 loc(#loc118)
23
+ %xoffset_1 = arith.constant 1 : i32 loc(#loc119)
24
+ %xoffset_2 = arith.constant 1 : i32 loc(#loc119)
25
+ %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc119)
26
+ %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120)
27
+ %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121)
28
+ %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc122)
29
+ %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc122)
30
+ %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc123)
31
+ %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc123)
32
+ %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc124)
33
+ %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc125)
34
+ %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc126)
35
+ %x1_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc126)
36
+ %x1_10 = arith.divsi %x1, %x1_9 : tensor<1x1xi64> loc(#loc126)
37
+ %x1_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc127)
38
+ %x1_12 = arith.remsi %x1_10, %x1_11 : tensor<1x1xi64> loc(#loc127)
39
+ %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc128)
40
+ %x0_13 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc128)
41
+ %x0_14 = arith.remsi %x0, %x0_13 : tensor<1x1xi64> loc(#loc128)
42
+ %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc129)
43
+ %x2_15 = tt.splat %ks4 : i64 -> tensor<1x1xi64> loc(#loc129)
44
+ %x2_16 = arith.divsi %x2, %x2_15 : tensor<1x1xi64> loc(#loc129)
45
+ %_tmp46 = arith.constant 0 : i64 loc(#loc130)
46
+ %_tmp46_17 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc130)
47
+ %c0_i32 = arith.constant 0 : i32 loc(#loc15)
48
+ %c2048_i32 = arith.constant 2048 : i32 loc(#loc15)
49
+ %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
50
+ %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15)
51
+ %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc15)
52
+ %3 = ub.poison : i32 loc(#loc15)
53
+ %_tmp46_18 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp46_22 = %_tmp46_17) -> (tensor<1x2048xi64>) : i32 {
54
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc132)
55
+ %r0_index_23 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc132)
56
+ %r0_mask = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc133)
57
+ %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x2048xi32> loc(#loc133)
58
+ %r0_4 = arith.constant 128 : i32 loc(#loc134)
59
+ %r0_4_25 = arith.constant 128 : i32 loc(#loc134)
60
+ %r0_4_26 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc134)
61
+ %r0_4_27 = arith.divsi %r0_index_23, %r0_4_26 : tensor<1x2048xi32> loc(#loc134)
62
+ %r0_3 = arith.constant 128 : i32 loc(#loc135)
63
+ %r0_3_28 = arith.constant 128 : i32 loc(#loc135)
64
+ %r0_3_29 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc135)
65
+ %r0_3_30 = arith.remsi %r0_index_23, %r0_3_29 : tensor<1x2048xi32> loc(#loc135)
66
+ %tmp0 = arith.constant 128 : i32 loc(#loc136)
67
+ %tmp0_31 = arith.constant 128 : i64 loc(#loc136)
68
+ %tmp0_32 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc136)
69
+ %tmp0_33 = arith.muli %tmp0_32, %x1_12 : tensor<1x1xi64> loc(#loc136)
70
+ %tmp0_34 = arith.extsi %r0_4_27 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc137)
71
+ %tmp0_35 = tt.broadcast %tmp0_33 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc137)
72
+ %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<1x2048xi64> loc(#loc137)
73
+ %tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64> loc(#loc138)
74
+ %tmp2_37 = arith.cmpi slt, %tmp0_36, %tmp2 : tensor<1x2048xi64> loc(#loc138)
75
+ %tmp3 = arith.constant 128 : i32 loc(#loc139)
76
+ %tmp3_38 = arith.constant 128 : i64 loc(#loc139)
77
+ %tmp3_39 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc139)
78
+ %tmp3_40 = arith.muli %tmp3_39, %x0_14 : tensor<1x1xi64> loc(#loc139)
79
+ %tmp3_41 = arith.extsi %r0_3_30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc140)
80
+ %tmp3_42 = tt.broadcast %tmp3_40 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc140)
81
+ %tmp3_43 = arith.addi %tmp3_41, %tmp3_42 : tensor<1x2048xi64> loc(#loc140)
82
+ %tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64> loc(#loc141)
83
+ %tmp5_44 = arith.cmpi slt, %tmp3_43, %tmp5 : tensor<1x2048xi64> loc(#loc141)
84
+ %tmp6 = arith.andi %tmp2_37, %tmp5_44 : tensor<1x2048xi1> loc(#loc142)
85
+ %tmp7 = arith.constant 128 : i32 loc(#loc143)
86
+ %tmp7_45 = arith.constant 128 : i64 loc(#loc143)
87
+ %tmp7_46 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc143)
88
+ %tmp7_47 = arith.muli %tmp7_46, %x1_12 : tensor<1x1xi64> loc(#loc143)
89
+ %tmp7_48 = arith.extsi %r0_4_27 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc144)
90
+ %tmp7_49 = tt.broadcast %tmp7_47 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc144)
91
+ %tmp7_50 = arith.addi %tmp7_48, %tmp7_49 : tensor<1x2048xi64> loc(#loc144)
92
+ %tmp8 = arith.constant 128 : i32 loc(#loc145)
93
+ %tmp8_51 = arith.constant 128 : i64 loc(#loc145)
94
+ %tmp8_52 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc145)
95
+ %tmp8_53 = arith.muli %tmp8_52, %x0_14 : tensor<1x1xi64> loc(#loc145)
96
+ %tmp8_54 = arith.extsi %r0_3_30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc146)
97
+ %tmp8_55 = tt.broadcast %tmp8_53 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc146)
98
+ %tmp8_56 = arith.addi %tmp8_54, %tmp8_55 : tensor<1x2048xi64> loc(#loc146)
99
+ %tmp9 = arith.cmpi sge, %tmp7_50, %tmp8_56 : tensor<1x2048xi64> loc(#loc147)
100
+ %tmp10 = tt.broadcast %x2_16 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc148)
101
+ %tmp10_57 = tt.splat %in_ptr0 : !tt.ptr<i64> -> tensor<1x2048x!tt.ptr<i64>> loc(#loc149)
102
+ %tmp10_58 = tt.addptr %tmp10_57, %tmp10 : tensor<1x2048x!tt.ptr<i64>>, tensor<1x2048xi64> loc(#loc149)
103
+ %tmp10_59 = arith.andi %r0_mask_24, %tmp6 : tensor<1x2048xi1> loc(#loc150)
104
+ %tmp10_60 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc151)
105
+ %tmp10_61 = arith.andi %tmp10_59, %tmp10_60 : tensor<1x2048xi1> loc(#loc151)
106
+ %tmp10_62 = arith.constant 0.000000e+00 : f32 loc(#loc152)
107
+ %tmp10_63 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc152)
108
+ %tmp10_64 = arith.fptosi %tmp10_63 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc152)
109
+ %tmp10_65 = tt.load %tmp10_58, %tmp10_61, %tmp10_64 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<i64>> loc(#loc152)
110
+ %tmp11 = arith.cmpi slt, %tmp8_56, %tmp10_65 : tensor<1x2048xi64> loc(#loc153)
111
+ %tmp12 = arith.cmpi slt, %tmp7_50, %tmp10_65 : tensor<1x2048xi64> loc(#loc154)
112
+ %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1> loc(#loc155)
113
+ %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1> loc(#loc156)
114
+ %tmp15 = arith.constant false loc(#loc157)
115
+ %tmp15_66 = arith.constant dense<false> : tensor<1x1xi1> loc(#loc157)
116
+ %tmp16 = arith.constant dense<false> : tensor<1x2048xi1> loc(#loc158)
117
+ %tmp16_67 = arith.ori %tmp16, %tmp14 : tensor<1x2048xi1> loc(#loc158)
118
+ %tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64> loc(#loc159)
119
+ %tmp18 = arith.cmpi sge, %tmp8_56, %tmp17 : tensor<1x2048xi64> loc(#loc160)
120
+ %tmp19 = arith.remsi %tmp8_56, %tmp17 : tensor<1x2048xi64> loc(#loc161)
121
+ %tmp20 = arith.constant 0 : i32 loc(#loc162)
122
+ %tmp20_68 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc162)
123
+ %tmp21 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc163)
124
+ %tmp21_69 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc163)
125
+ %tmp21_70 = arith.cmpi ne, %tmp19, %tmp21_69 : tensor<1x2048xi64> loc(#loc163)
126
+ %tmp22 = arith.constant 0 : i32 loc(#loc164)
127
+ %tmp22_71 = arith.extsi %tmp22 : i32 to i64 loc(#loc164)
128
+ %tmp22_72 = tt.splat %tmp22_71 : i64 -> tensor<1x2048xi64> loc(#loc164)
129
+ %tmp22_73 = arith.cmpi slt, %tmp19, %tmp22_72 : tensor<1x2048xi64> loc(#loc164)
130
+ %tmp23 = arith.constant 0 : i32 loc(#loc165)
131
+ %tmp23_74 = arith.extsi %tmp23 : i32 to i64 loc(#loc165)
132
+ %tmp23_75 = tt.splat %tmp23_74 : i64 -> tensor<1x2048xi64> loc(#loc165)
133
+ %tmp23_76 = arith.cmpi slt, %tmp17, %tmp23_75 : tensor<1x2048xi64> loc(#loc165)
134
+ %tmp24 = arith.cmpi ne, %tmp22_73, %tmp23_76 : tensor<1x2048xi1> loc(#loc166)
135
+ %tmp25 = arith.andi %tmp21_70, %tmp24 : tensor<1x2048xi1> loc(#loc167)
136
+ %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64> loc(#loc168)
137
+ %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc169)
138
+ %tmp28 = arith.cmpi slt, %tmp27, %tmp10_65 : tensor<1x2048xi64> loc(#loc170)
139
+ %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1> loc(#loc171)
140
+ %tmp30 = arith.constant -1 : i32 loc(#loc172)
141
+ %tmp30_77 = arith.constant -1 : i32 loc(#loc172)
142
+ %tmp30_78 = arith.constant dense<-1> : tensor<1x2048xi32> loc(#loc172)
143
+ %tmp30_79 = arith.muli %tmp30_78, %r0_4_27 : tensor<1x2048xi32> loc(#loc172)
144
+ %tmp30_80 = arith.addi %r0_3_30, %tmp30_79 : tensor<1x2048xi32> loc(#loc173)
145
+ %tmp30_81 = arith.constant -128 : i32 loc(#loc174)
146
+ %tmp30_82 = arith.constant -128 : i64 loc(#loc174)
147
+ %tmp30_83 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc174)
148
+ %tmp30_84 = arith.muli %tmp30_83, %x1_12 : tensor<1x1xi64> loc(#loc174)
149
+ %tmp30_85 = arith.extsi %tmp30_80 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc175)
150
+ %tmp30_86 = tt.broadcast %tmp30_84 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc175)
151
+ %tmp30_87 = arith.addi %tmp30_85, %tmp30_86 : tensor<1x2048xi64> loc(#loc175)
152
+ %tmp30_88 = arith.constant 128 : i32 loc(#loc176)
153
+ %tmp30_89 = arith.constant 128 : i64 loc(#loc176)
154
+ %tmp30_90 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc176)
155
+ %tmp30_91 = arith.muli %tmp30_90, %x0_14 : tensor<1x1xi64> loc(#loc176)
156
+ %tmp30_92 = tt.broadcast %tmp30_91 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc177)
157
+ %tmp30_93 = arith.addi %tmp30_87, %tmp30_92 : tensor<1x2048xi64> loc(#loc177)
158
+ %tmp31 = arith.remsi %tmp30_93, %tmp17 : tensor<1x2048xi64> loc(#loc178)
159
+ %tmp32 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc179)
160
+ %tmp32_94 = tt.broadcast %tmp32 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc179)
161
+ %tmp32_95 = arith.cmpi ne, %tmp31, %tmp32_94 : tensor<1x2048xi64> loc(#loc179)
162
+ %tmp33 = arith.constant 0 : i32 loc(#loc180)
163
+ %tmp33_96 = arith.extsi %tmp33 : i32 to i64 loc(#loc180)
164
+ %tmp33_97 = tt.splat %tmp33_96 : i64 -> tensor<1x2048xi64> loc(#loc180)
165
+ %tmp33_98 = arith.cmpi slt, %tmp31, %tmp33_97 : tensor<1x2048xi64> loc(#loc180)
166
+ %tmp34 = arith.cmpi ne, %tmp33_98, %tmp23_76 : tensor<1x2048xi1> loc(#loc181)
167
+ %tmp35 = arith.andi %tmp32_95, %tmp34 : tensor<1x2048xi1> loc(#loc182)
168
+ %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64> loc(#loc183)
169
+ %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc184)
170
+ %tmp38 = arith.constant 0 : i64 loc(#loc185)
171
+ %tmp38_99 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc185)
172
+ %tmp39 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc186)
173
+ %tmp39_100 = arith.cmpi eq, %tmp37, %tmp39 : tensor<1x2048xi64> loc(#loc186)
174
+ %tmp40 = arith.andi %tmp29, %tmp39_100 : tensor<1x2048xi1> loc(#loc187)
175
+ %tmp41 = arith.ori %tmp16_67, %tmp40 : tensor<1x2048xi1> loc(#loc188)
176
+ %tmp42 = arith.constant false loc(#loc189)
177
+ %tmp42_101 = arith.constant dense<false> : tensor<1x2048xi1> loc(#loc189)
178
+ %tmp43 = arith.select %tmp6, %tmp41, %tmp42_101 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc190)
179
+ %tmp44 = arith.extui %tmp43 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc191)
180
+ %tmp47 = arith.addi %_tmp46_22, %tmp44 : tensor<1x2048xi64> loc(#loc192)
181
+ %_tmp46_102 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc193)
182
+ %_tmp46_103 = arith.andi %r0_mask_24, %_tmp46_102 : tensor<1x2048xi1> loc(#loc193)
183
+ %_tmp46_104 = arith.select %_tmp46_103, %tmp47, %_tmp46_22 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc194)
184
+ scf.yield %_tmp46_104 : tensor<1x2048xi64> loc(#loc79)
185
+ } loc(#loc131)
186
+ %tmp46 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp46_18) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc195)
187
+ %tmp46_19 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc196)
188
+ %tmp48 = arith.constant 0 : i64 loc(#loc197)
189
+ %tmp48_20 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc197)
190
+ %tmp49 = arith.cmpi sgt, %tmp46_19, %tmp48_20 : tensor<1x1xi64> loc(#loc198)
191
+ %tmp50 = arith.constant 16384 : i64 loc(#loc199)
192
+ %tmp50_21 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc199)
193
+ %tmp51 = arith.cmpi slt, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc200)
194
+ %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc201)
195
+ %tmp53 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc202)
196
+ %tmp54 = arith.extsi %tmp53 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc203)
197
+ %tmp55 = arith.cmpi eq, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc204)
198
+ %tmp56 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc205)
199
+ %tmp57 = arith.extsi %tmp56 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc206)
200
+ %4 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc92)
201
+ %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr<i32>>, tensor<1x1xi32> loc(#loc92)
202
+ tt.store %5, %tmp54, %xmask_7 : tensor<1x1x!tt.ptr<i32>> loc(#loc93)
203
+ %6 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc94)
204
+ %7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr<i32>>, tensor<1x1xi32> loc(#loc94)
205
+ tt.store %7, %tmp57, %xmask_7 : tensor<1x1x!tt.ptr<i32>> loc(#loc95)
206
+ tt.return loc(#loc96)
207
+ } loc(#loc)
208
+ tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc97))) -> tensor<1xi64> attributes {noinline = false} {
209
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
210
+ ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)):
211
+ %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc98)
212
+ tt.reduce.return %2 : i64 loc(#loc98)
213
+ }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc98)
214
+ tt.return %0 : tensor<1xi64> loc(#loc100)
215
+ ^bb1: // no predecessors
216
+ %1 = ub.poison : tensor<1xi64> loc(#loc101)
217
+ tt.return %1 : tensor<1xi64> loc(#loc101)
218
+ } loc(#loc97)
219
+ tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc102)), %b: i64 loc("b"(#loc102))) -> i64 attributes {noinline = false} {
220
+ %0 = arith.addi %a, %b : i64 loc(#loc103)
221
+ tt.return %0 : i64 loc(#loc104)
222
+ ^bb1: // no predecessors
223
+ %1 = ub.poison : i64 loc(#loc105)
224
+ tt.return %1 : i64 loc(#loc105)
225
+ } loc(#loc102)
226
+ } loc(#loc)
227
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":19:15)
228
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28)
229
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:33)
230
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:36)
231
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:44)
232
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:23)
233
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21)
234
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:27)
235
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37)
236
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21)
237
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28)
238
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19)
239
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19)
240
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":30:44)
241
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40)
242
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31)
243
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29)
244
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27)
245
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27)
246
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26)
247
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22)
248
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22)
249
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26)
250
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22)
251
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22)
252
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22)
253
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":46:26)
254
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":46:22)
255
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":47:26)
256
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":47:22)
257
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23)
258
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:55)
259
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35)
260
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87)
261
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94)
262
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77)
263
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23)
264
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23)
265
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24)
266
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23)
267
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":54:39)
268
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":55:24)
269
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37)
270
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24)
271
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24)
272
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":59:35)
273
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25)
274
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92)
275
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92)
276
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25)
277
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24)
278
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24)
279
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39)
280
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24)
281
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24)
282
+ #loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:29)
283
+ #loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24)
284
+ #loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45)
285
+ #loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38)
286
+ #loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:55)
287
+ #loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51)
288
+ #loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25)
289
+ #loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25)
290
+ #loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92)
291
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25)
292
+ #loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24)
293
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24)
294
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39)
295
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":77:35)
296
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25)
297
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24)
298
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24)
299
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":81:44)
300
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38)
301
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25)
302
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25)
303
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36)
304
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50)
305
+ #loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8)
306
+ #loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27)
307
+ #loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30)
308
+ #loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":88:31)
309
+ #loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20)
310
+ #loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":90:35)
311
+ #loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20)
312
+ #loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20)
313
+ #loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21)
314
+ #loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21)
315
+ #loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21)
316
+ #loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21)
317
+ #loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21)
318
+ #loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25)
319
+ #loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37)
320
+ #loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25)
321
+ #loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37)
322
+ #loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4)
323
+ #loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
324
+ #loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
325
+ #loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
326
+ #loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
327
+ #loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
328
+ #loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
329
+ #loc117 = loc("r0_numel"(#loc1))
330
+ #loc118 = loc("xoffset"(#loc2))
331
+ #loc119 = loc("xoffset"(#loc3))
332
+ #loc120 = loc("xindex"(#loc4))
333
+ #loc121 = loc("xindex"(#loc5))
334
+ #loc122 = loc("xindex"(#loc6))
335
+ #loc123 = loc("xmask"(#loc7))
336
+ #loc124 = loc("r0_base"(#loc8))
337
+ #loc125 = loc("r0_base"(#loc9))
338
+ #loc126 = loc("x1"(#loc10))
339
+ #loc127 = loc("x1"(#loc11))
340
+ #loc128 = loc("x0"(#loc12))
341
+ #loc129 = loc("x2"(#loc13))
342
+ #loc130 = loc("_tmp46"(#loc14))
343
+ #loc131 = loc("_tmp46"(#loc15))
344
+ #loc132 = loc("r0_index"(#loc16))
345
+ #loc133 = loc("r0_mask"(#loc17))
346
+ #loc134 = loc("r0_4"(#loc18))
347
+ #loc135 = loc("r0_3"(#loc19))
348
+ #loc136 = loc("tmp0"(#loc20))
349
+ #loc137 = loc("tmp0"(#loc21))
350
+ #loc138 = loc("tmp2"(#loc22))
351
+ #loc139 = loc("tmp3"(#loc23))
352
+ #loc140 = loc("tmp3"(#loc24))
353
+ #loc141 = loc("tmp5"(#loc25))
354
+ #loc142 = loc("tmp6"(#loc26))
355
+ #loc143 = loc("tmp7"(#loc27))
356
+ #loc144 = loc("tmp7"(#loc28))
357
+ #loc145 = loc("tmp8"(#loc29))
358
+ #loc146 = loc("tmp8"(#loc30))
359
+ #loc147 = loc("tmp9"(#loc31))
360
+ #loc148 = loc("tmp10"(#loc32))
361
+ #loc149 = loc("tmp10"(#loc33))
362
+ #loc150 = loc("tmp10"(#loc34))
363
+ #loc151 = loc("tmp10"(#loc35))
364
+ #loc152 = loc("tmp10"(#loc36))
365
+ #loc153 = loc("tmp11"(#loc37))
366
+ #loc154 = loc("tmp12"(#loc38))
367
+ #loc155 = loc("tmp13"(#loc39))
368
+ #loc156 = loc("tmp14"(#loc40))
369
+ #loc157 = loc("tmp15"(#loc41))
370
+ #loc158 = loc("tmp16"(#loc42))
371
+ #loc159 = loc("tmp17"(#loc43))
372
+ #loc160 = loc("tmp18"(#loc44))
373
+ #loc161 = loc("tmp19"(#loc45))
374
+ #loc162 = loc("tmp20"(#loc46))
375
+ #loc163 = loc("tmp21"(#loc47))
376
+ #loc164 = loc("tmp22"(#loc48))
377
+ #loc165 = loc("tmp23"(#loc49))
378
+ #loc166 = loc("tmp24"(#loc50))
379
+ #loc167 = loc("tmp25"(#loc51))
380
+ #loc168 = loc("tmp26"(#loc52))
381
+ #loc169 = loc("tmp27"(#loc53))
382
+ #loc170 = loc("tmp28"(#loc54))
383
+ #loc171 = loc("tmp29"(#loc55))
384
+ #loc172 = loc("tmp30"(#loc56))
385
+ #loc173 = loc("tmp30"(#loc57))
386
+ #loc174 = loc("tmp30"(#loc58))
387
+ #loc175 = loc("tmp30"(#loc59))
388
+ #loc176 = loc("tmp30"(#loc60))
389
+ #loc177 = loc("tmp30"(#loc61))
390
+ #loc178 = loc("tmp31"(#loc62))
391
+ #loc179 = loc("tmp32"(#loc63))
392
+ #loc180 = loc("tmp33"(#loc64))
393
+ #loc181 = loc("tmp34"(#loc65))
394
+ #loc182 = loc("tmp35"(#loc66))
395
+ #loc183 = loc("tmp36"(#loc67))
396
+ #loc184 = loc("tmp37"(#loc68))
397
+ #loc185 = loc("tmp38"(#loc69))
398
+ #loc186 = loc("tmp39"(#loc70))
399
+ #loc187 = loc("tmp40"(#loc71))
400
+ #loc188 = loc("tmp41"(#loc72))
401
+ #loc189 = loc("tmp42"(#loc73))
402
+ #loc190 = loc("tmp43"(#loc74))
403
+ #loc191 = loc("tmp44"(#loc75))
404
+ #loc192 = loc("tmp47"(#loc76))
405
+ #loc193 = loc("_tmp46"(#loc77))
406
+ #loc194 = loc("_tmp46"(#loc78))
407
+ #loc195 = loc("tmp46"(#loc80))
408
+ #loc196 = loc("tmp46"(#loc81))
409
+ #loc197 = loc("tmp48"(#loc82))
410
+ #loc198 = loc("tmp49"(#loc83))
411
+ #loc199 = loc("tmp50"(#loc84))
412
+ #loc200 = loc("tmp51"(#loc85))
413
+ #loc201 = loc("tmp52"(#loc86))
414
+ #loc202 = loc("tmp53"(#loc87))
415
+ #loc203 = loc("tmp54"(#loc88))
416
+ #loc204 = loc("tmp55"(#loc89))
417
+ #loc205 = loc("tmp56"(#loc90))
418
+ #loc206 = loc("tmp57"(#loc91))
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}>
2
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0)
3
+ #loc1 = loc(unknown)
4
+ #loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27)
5
+ #loc79 = loc("in_ptr0"(#loc))
6
+ #loc80 = loc("out_ptr1"(#loc))
7
+ #loc81 = loc("out_ptr2"(#loc))
8
+ #loc82 = loc("ks0"(#loc))
9
+ #loc83 = loc("ks1"(#loc))
10
+ #loc84 = loc("ks2"(#loc))
11
+ #loc85 = loc("ks3"(#loc))
12
+ #loc86 = loc("ks4"(#loc))
13
+ #loc87 = loc("ks5"(#loc))
14
+ #loc88 = loc("xnumel"(#loc))
15
+ #loc89 = loc("r0_numel"(#loc))
16
+ #loc149 = loc("tmp46"(#loc63))
17
+ #loc164 = loc(callsite(#loc1 at #loc149))
18
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
19
+ tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
20
+ %cst = arith.constant dense<128> : tensor<1x2048xi32, #blocked> loc(#loc1)
21
+ %cst_0 = arith.constant dense<16384> : tensor<1x2048xi32, #blocked> loc(#loc1)
22
+ %c-128_i64 = arith.constant -128 : i64 loc(#loc1)
23
+ %c0_i64 = arith.constant 0 : i64 loc(#loc1)
24
+ %c128_i64 = arith.constant 128 : i64 loc(#loc1)
25
+ %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
26
+ %c16384_i32 = arith.constant 16384 : i32 loc(#loc1)
27
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
28
+ %cst_1 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1)
29
+ %cst_2 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1)
30
+ %cst_3 = arith.constant dense<false> : tensor<1x2048xi1, #blocked> loc(#loc1)
31
+ %cst_4 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1)
32
+ %xoffset = tt.get_program_id x : i32 loc(#loc90)
33
+ %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc91)
34
+ %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc92)
35
+ %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc92)
36
+ %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc93)
37
+ %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc93)
38
+ %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc94)
39
+ %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc95)
40
+ %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc96)
41
+ %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc97)
42
+ %tmp0_8 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc159)
43
+ %tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc99)
44
+ %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc100)
45
+ %tmp3_9 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc160)
46
+ %tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc102)
47
+ %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr<i64>, i64 loc(#loc103)
48
+ %tmp10_10 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc161)
49
+ %tmp10_11 = tt.splat %tmp10 : !tt.ptr<i64> -> tensor<1x2048x!tt.ptr<i64>, #blocked> loc(#loc105)
50
+ %tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc106)
51
+ %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc107)
52
+ %tmp23_12 = tt.splat %tmp23 : i1 -> tensor<1x2048xi1, #blocked> loc(#loc107)
53
+ %tmp30 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc108)
54
+ %tmp30_13 = tt.splat %tmp30 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc162)
55
+ %_tmp46 = scf.for %_tmp46_15 = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%arg12 = %cst_4) -> (tensor<1x2048xi64, #blocked>) : i32 {
56
+ %r0_index = tt.splat %_tmp46_15 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc111)
57
+ %r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc111)
58
+ %r0_mask = arith.cmpi slt, %r0_index_16, %cst_0 : tensor<1x2048xi32, #blocked> loc(#loc112)
59
+ %r0_4 = arith.divsi %r0_index_16, %cst : tensor<1x2048xi32, #blocked> loc(#loc113)
60
+ %r0_3 = arith.remsi %r0_index_16, %cst : tensor<1x2048xi32, #blocked> loc(#loc114)
61
+ %tmp0_17 = arith.extsi %r0_4 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc98)
62
+ %tmp0_18 = arith.addi %tmp0_17, %tmp0_8 : tensor<1x2048xi64, #blocked> loc(#loc98)
63
+ %tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x2048xi64, #blocked> loc(#loc99)
64
+ %tmp3_20 = arith.extsi %r0_3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc101)
65
+ %tmp3_21 = arith.addi %tmp3_20, %tmp3_9 : tensor<1x2048xi64, #blocked> loc(#loc101)
66
+ %tmp5_22 = arith.cmpi slt, %tmp3_21, %tmp5 : tensor<1x2048xi64, #blocked> loc(#loc102)
67
+ %tmp6 = arith.andi %tmp2_19, %tmp5_22 : tensor<1x2048xi1, #blocked> loc(#loc115)
68
+ %tmp9 = arith.cmpi sge, %tmp0_18, %tmp3_21 : tensor<1x2048xi64, #blocked> loc(#loc116)
69
+ %tmp10_23 = arith.andi %r0_mask, %tmp6 : tensor<1x2048xi1, #blocked> loc(#loc117)
70
+ %tmp10_24 = arith.andi %tmp10_23, %tmp10_10 : tensor<1x2048xi1, #blocked> loc(#loc104)
71
+ %tmp10_25 = tt.load %tmp10_11, %tmp10_24, %cst_4 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<i64>, #blocked> loc(#loc105)
72
+ %tmp11 = arith.cmpi slt, %tmp3_21, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc118)
73
+ %tmp12 = arith.cmpi slt, %tmp0_18, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc119)
74
+ %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1, #blocked> loc(#loc120)
75
+ %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1, #blocked> loc(#loc121)
76
+ %tmp18 = arith.cmpi sge, %tmp3_21, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc122)
77
+ %tmp19 = arith.remsi %tmp3_21, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc123)
78
+ %tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc124)
79
+ %tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc125)
80
+ %tmp24 = arith.cmpi ne, %tmp22, %tmp23_12 : tensor<1x2048xi1, #blocked> loc(#loc126)
81
+ %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x2048xi1, #blocked> loc(#loc127)
82
+ %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc128)
83
+ %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc129)
84
+ %tmp28 = arith.cmpi slt, %tmp27, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc130)
85
+ %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1, #blocked> loc(#loc131)
86
+ %tmp30_26 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32, #blocked> loc(#loc132)
87
+ %tmp30_27 = arith.extsi %tmp30_26 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc109)
88
+ %tmp30_28 = arith.addi %tmp30_27, %tmp30_13 : tensor<1x2048xi64, #blocked> loc(#loc109)
89
+ %tmp30_29 = arith.addi %tmp30_28, %tmp3_9 : tensor<1x2048xi64, #blocked> loc(#loc133)
90
+ %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc134)
91
+ %tmp32 = arith.cmpi ne, %tmp31, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc135)
92
+ %tmp33 = arith.cmpi slt, %tmp31, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc136)
93
+ %tmp34 = arith.cmpi ne, %tmp33, %tmp23_12 : tensor<1x2048xi1, #blocked> loc(#loc137)
94
+ %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x2048xi1, #blocked> loc(#loc138)
95
+ %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc139)
96
+ %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc140)
97
+ %tmp39 = arith.cmpi eq, %tmp37, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc141)
98
+ %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x2048xi1, #blocked> loc(#loc142)
99
+ %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x2048xi1, #blocked> loc(#loc143)
100
+ %tmp43 = arith.select %tmp6, %tmp41, %cst_3 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi1, #blocked> loc(#loc144)
101
+ %tmp44 = arith.extui %tmp43 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc145)
102
+ %tmp47 = arith.addi %arg12, %tmp44 : tensor<1x2048xi64, #blocked> loc(#loc146)
103
+ %_tmp46_30 = arith.andi %r0_mask, %tmp10_10 : tensor<1x2048xi1, #blocked> loc(#loc147)
104
+ %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %arg12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc148)
105
+ scf.yield %_tmp46_31 : tensor<1x2048xi64, #blocked> loc(#loc61)
106
+ } loc(#loc110)
107
+ %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({
108
+ ^bb0(%tmp46_15: i64 loc(callsite(#loc1 at #loc149)), %tmp46_16: i64 loc(callsite(#loc1 at #loc149))):
109
+ %tmp46_17 = arith.addi %tmp46_15, %tmp46_16 : i64 loc(#loc167)
110
+ tt.reduce.return %tmp46_17 : i64 loc(#loc163)
111
+ }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc163)
112
+ %tmp46_14 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc150)
113
+ %tmp49 = arith.cmpi sgt, %tmp46_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc151)
114
+ %tmp51 = arith.cmpi slt, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc152)
115
+ %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1, #blocked> loc(#loc153)
116
+ %tmp54 = arith.extui %tmp52 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc165)
117
+ %tmp55 = arith.cmpi eq, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc156)
118
+ %tmp57 = arith.extui %tmp55 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc166)
119
+ %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc74)
120
+ %1 = tt.splat %0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc75)
121
+ %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc75)
122
+ tt.store %1, %tmp54, %2 : tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc75)
123
+ %3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr<i32>, i32 loc(#loc76)
124
+ %4 = tt.splat %3 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc77)
125
+ tt.store %4, %tmp57, %2 : tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc77)
126
+ tt.return loc(#loc78)
127
+ } loc(#loc)
128
+ } loc(#loc)
129
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28)
130
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21)
131
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37)
132
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21)
133
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28)
134
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19)
135
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19)
136
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26)
137
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22)
138
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22)
139
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26)
140
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22)
141
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22)
142
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35)
143
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94)
144
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77)
145
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37)
146
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92)
147
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45)
148
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38)
149
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40)
150
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31)
151
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29)
152
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27)
153
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27)
154
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22)
155
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23)
156
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87)
157
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23)
158
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23)
159
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24)
160
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23)
161
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24)
162
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24)
163
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25)
164
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92)
165
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25)
166
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24)
167
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24)
168
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39)
169
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24)
170
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24)
171
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24)
172
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51)
173
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25)
174
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25)
175
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92)
176
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25)
177
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24)
178
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24)
179
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39)
180
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25)
181
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24)
182
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24)
183
+ #loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38)
184
+ #loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25)
185
+ #loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25)
186
+ #loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36)
187
+ #loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50)
188
+ #loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8)
189
+ #loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
190
+ #loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
191
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30)
192
+ #loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20)
193
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20)
194
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20)
195
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21)
196
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21)
197
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21)
198
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21)
199
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21)
200
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25)
201
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37)
202
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25)
203
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37)
204
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4)
205
+ #loc90 = loc("xoffset"(#loc2))
206
+ #loc91 = loc("xmask"(#loc3))
207
+ #loc92 = loc("r0_base"(#loc4))
208
+ #loc93 = loc("x1"(#loc5))
209
+ #loc94 = loc("x1"(#loc6))
210
+ #loc95 = loc("x0"(#loc7))
211
+ #loc96 = loc("x2"(#loc8))
212
+ #loc97 = loc("tmp0"(#loc9))
213
+ #loc98 = loc("tmp0"(#loc10))
214
+ #loc99 = loc("tmp2"(#loc11))
215
+ #loc100 = loc("tmp3"(#loc12))
216
+ #loc101 = loc("tmp3"(#loc13))
217
+ #loc102 = loc("tmp5"(#loc14))
218
+ #loc103 = loc("tmp10"(#loc15))
219
+ #loc104 = loc("tmp10"(#loc16))
220
+ #loc105 = loc("tmp10"(#loc17))
221
+ #loc106 = loc("tmp17"(#loc18))
222
+ #loc107 = loc("tmp23"(#loc19))
223
+ #loc108 = loc("tmp30"(#loc20))
224
+ #loc109 = loc("tmp30"(#loc21))
225
+ #loc110 = loc("_tmp46"(#loc22))
226
+ #loc111 = loc("r0_index"(#loc23))
227
+ #loc112 = loc("r0_mask"(#loc24))
228
+ #loc113 = loc("r0_4"(#loc25))
229
+ #loc114 = loc("r0_3"(#loc26))
230
+ #loc115 = loc("tmp6"(#loc27))
231
+ #loc116 = loc("tmp9"(#loc28))
232
+ #loc117 = loc("tmp10"(#loc29))
233
+ #loc118 = loc("tmp11"(#loc30))
234
+ #loc119 = loc("tmp12"(#loc31))
235
+ #loc120 = loc("tmp13"(#loc32))
236
+ #loc121 = loc("tmp14"(#loc33))
237
+ #loc122 = loc("tmp18"(#loc34))
238
+ #loc123 = loc("tmp19"(#loc35))
239
+ #loc124 = loc("tmp21"(#loc36))
240
+ #loc125 = loc("tmp22"(#loc37))
241
+ #loc126 = loc("tmp24"(#loc38))
242
+ #loc127 = loc("tmp25"(#loc39))
243
+ #loc128 = loc("tmp26"(#loc40))
244
+ #loc129 = loc("tmp27"(#loc41))
245
+ #loc130 = loc("tmp28"(#loc42))
246
+ #loc131 = loc("tmp29"(#loc43))
247
+ #loc132 = loc("tmp30"(#loc44))
248
+ #loc133 = loc("tmp30"(#loc45))
249
+ #loc134 = loc("tmp31"(#loc46))
250
+ #loc135 = loc("tmp32"(#loc47))
251
+ #loc136 = loc("tmp33"(#loc48))
252
+ #loc137 = loc("tmp34"(#loc49))
253
+ #loc138 = loc("tmp35"(#loc50))
254
+ #loc139 = loc("tmp36"(#loc51))
255
+ #loc140 = loc("tmp37"(#loc52))
256
+ #loc141 = loc("tmp39"(#loc53))
257
+ #loc142 = loc("tmp40"(#loc54))
258
+ #loc143 = loc("tmp41"(#loc55))
259
+ #loc144 = loc("tmp43"(#loc56))
260
+ #loc145 = loc("tmp44"(#loc57))
261
+ #loc146 = loc("tmp47"(#loc58))
262
+ #loc147 = loc("_tmp46"(#loc59))
263
+ #loc148 = loc("_tmp46"(#loc60))
264
+ #loc150 = loc("tmp46"(#loc65))
265
+ #loc151 = loc("tmp49"(#loc66))
266
+ #loc152 = loc("tmp51"(#loc67))
267
+ #loc153 = loc("tmp52"(#loc68))
268
+ #loc154 = loc("tmp54"(#loc69))
269
+ #loc155 = loc("tmp53"(#loc70))
270
+ #loc156 = loc("tmp55"(#loc71))
271
+ #loc157 = loc("tmp57"(#loc72))
272
+ #loc158 = loc("tmp56"(#loc73))
273
+ #loc159 = loc(fused[#loc98, #loc97])
274
+ #loc160 = loc(fused[#loc101, #loc100])
275
+ #loc161 = loc(fused[#loc104, #loc91])
276
+ #loc162 = loc(fused[#loc109, #loc108])
277
+ #loc163 = loc(callsite(#loc62 at #loc149))
278
+ #loc165 = loc(fused[#loc154, #loc155])
279
+ #loc166 = loc(fused[#loc157, #loc158])
280
+ #loc167 = loc(callsite(#loc64 at #loc163))
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27)
4
+ #loc81 = loc("in_ptr0"(#loc))
5
+ #loc82 = loc("out_ptr1"(#loc))
6
+ #loc83 = loc("out_ptr2"(#loc))
7
+ #loc84 = loc("ks0"(#loc))
8
+ #loc85 = loc("ks1"(#loc))
9
+ #loc86 = loc("ks2"(#loc))
10
+ #loc87 = loc("ks3"(#loc))
11
+ #loc88 = loc("ks4"(#loc))
12
+ #loc89 = loc("ks5"(#loc))
13
+ #loc90 = loc("xnumel"(#loc))
14
+ #loc91 = loc("r0_numel"(#loc))
15
+ #loc153 = loc("tmp46"(#loc65))
16
+ #loc168 = loc(callsite(#loc1 at #loc153))
17
+ module {
18
+ tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
19
+ %c-128_i64 = arith.constant -128 : i64 loc(#loc1)
20
+ %c0_i64 = arith.constant 0 : i64 loc(#loc1)
21
+ %c128_i64 = arith.constant 128 : i64 loc(#loc1)
22
+ %c2048_i32 = arith.constant 2048 : i32 loc(#loc2)
23
+ %c16384_i32 = arith.constant 16384 : i32 loc(#loc2)
24
+ %c0_i32 = arith.constant 0 : i32 loc(#loc2)
25
+ %tmp50 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc92)
26
+ %cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1)
27
+ %cst_0 = arith.constant dense<false> : tensor<1x2048xi1> loc(#loc1)
28
+ %cst_1 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc1)
29
+ %cst_2 = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc1)
30
+ %cst_3 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc1)
31
+ %xoffset = tt.get_program_id x : i32 loc(#loc93)
32
+ %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc94)
33
+ %xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc94)
34
+ %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc95)
35
+ %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc96)
36
+ %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc97)
37
+ %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc97)
38
+ %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc98)
39
+ %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc99)
40
+ %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc100)
41
+ %_tmp46 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp46_9 = %cst_3) -> (tensor<1x2048xi64>) : i32 {
42
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc102)
43
+ %r0_index_10 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32> loc(#loc102)
44
+ %r0_mask = arith.cmpi slt, %r0_index_10, %cst_2 : tensor<1x2048xi32> loc(#loc103)
45
+ %r0_4 = arith.divsi %r0_index_10, %cst_1 : tensor<1x2048xi32> loc(#loc104)
46
+ %r0_3 = arith.remsi %r0_index_10, %cst_1 : tensor<1x2048xi32> loc(#loc105)
47
+ %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc106)
48
+ %tmp0_11 = arith.extsi %r0_4 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc107)
49
+ %tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc163)
50
+ %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x2048xi64> loc(#loc107)
51
+ %tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64> loc(#loc108)
52
+ %tmp2_14 = arith.cmpi slt, %tmp0_13, %tmp2 : tensor<1x2048xi64> loc(#loc108)
53
+ %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc109)
54
+ %tmp3_15 = arith.extsi %r0_3 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc110)
55
+ %tmp3_16 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64> loc(#loc164)
56
+ %tmp3_17 = arith.addi %tmp3_15, %tmp3_16 : tensor<1x2048xi64> loc(#loc110)
57
+ %tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64> loc(#loc111)
58
+ %tmp5_18 = arith.cmpi slt, %tmp3_17, %tmp5 : tensor<1x2048xi64> loc(#loc111)
59
+ %tmp6 = arith.andi %tmp2_14, %tmp5_18 : tensor<1x2048xi1> loc(#loc112)
60
+ %tmp9 = arith.cmpi sge, %tmp0_13, %tmp3_17 : tensor<1x2048xi64> loc(#loc113)
61
+ %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr<i64>, i64 loc(#loc114)
62
+ %tmp10_19 = tt.splat %tmp10 : !tt.ptr<i64> -> tensor<1x2048x!tt.ptr<i64>> loc(#loc114)
63
+ %tmp10_20 = arith.andi %r0_mask, %tmp6 : tensor<1x2048xi1> loc(#loc115)
64
+ %tmp10_21 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc165)
65
+ %tmp10_22 = arith.andi %tmp10_20, %tmp10_21 : tensor<1x2048xi1> loc(#loc116)
66
+ %tmp10_23 = tt.load %tmp10_19, %tmp10_22, %cst_3 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<i64>> loc(#loc117)
67
+ %tmp11 = arith.cmpi slt, %tmp3_17, %tmp10_23 : tensor<1x2048xi64> loc(#loc118)
68
+ %tmp12 = arith.cmpi slt, %tmp0_13, %tmp10_23 : tensor<1x2048xi64> loc(#loc119)
69
+ %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1> loc(#loc120)
70
+ %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1> loc(#loc121)
71
+ %tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64> loc(#loc122)
72
+ %tmp18 = arith.cmpi sge, %tmp3_17, %tmp17 : tensor<1x2048xi64> loc(#loc123)
73
+ %tmp19 = arith.remsi %tmp3_17, %tmp17 : tensor<1x2048xi64> loc(#loc124)
74
+ %tmp21 = arith.cmpi ne, %tmp19, %cst_3 : tensor<1x2048xi64> loc(#loc125)
75
+ %tmp22 = arith.cmpi slt, %tmp19, %cst_3 : tensor<1x2048xi64> loc(#loc126)
76
+ %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc127)
77
+ %tmp23_24 = tt.splat %tmp23 : i1 -> tensor<1x2048xi1> loc(#loc127)
78
+ %tmp24 = arith.cmpi ne, %tmp22, %tmp23_24 : tensor<1x2048xi1> loc(#loc128)
79
+ %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x2048xi1> loc(#loc129)
80
+ %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64> loc(#loc130)
81
+ %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc131)
82
+ %tmp28 = arith.cmpi slt, %tmp27, %tmp10_23 : tensor<1x2048xi64> loc(#loc132)
83
+ %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1> loc(#loc133)
84
+ %tmp30 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32> loc(#loc134)
85
+ %tmp30_25 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc135)
86
+ %tmp30_26 = arith.extsi %tmp30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc136)
87
+ %tmp30_27 = tt.splat %tmp30_25 : i64 -> tensor<1x2048xi64> loc(#loc166)
88
+ %tmp30_28 = arith.addi %tmp30_26, %tmp30_27 : tensor<1x2048xi64> loc(#loc136)
89
+ %tmp30_29 = arith.addi %tmp30_28, %tmp3_16 : tensor<1x2048xi64> loc(#loc137)
90
+ %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x2048xi64> loc(#loc138)
91
+ %tmp32 = arith.cmpi ne, %tmp31, %cst_3 : tensor<1x2048xi64> loc(#loc139)
92
+ %tmp33 = arith.cmpi slt, %tmp31, %cst_3 : tensor<1x2048xi64> loc(#loc140)
93
+ %tmp34 = arith.cmpi ne, %tmp33, %tmp23_24 : tensor<1x2048xi1> loc(#loc141)
94
+ %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x2048xi1> loc(#loc142)
95
+ %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64> loc(#loc143)
96
+ %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc144)
97
+ %tmp39 = arith.cmpi eq, %tmp37, %cst_3 : tensor<1x2048xi64> loc(#loc145)
98
+ %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x2048xi1> loc(#loc146)
99
+ %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x2048xi1> loc(#loc147)
100
+ %tmp43 = arith.select %tmp6, %tmp41, %cst_0 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc148)
101
+ %tmp44 = arith.extui %tmp43 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc149)
102
+ %tmp47 = arith.addi %_tmp46_9, %tmp44 : tensor<1x2048xi64> loc(#loc150)
103
+ %_tmp46_30 = arith.andi %r0_mask, %tmp10_21 : tensor<1x2048xi1> loc(#loc151)
104
+ %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %_tmp46_9 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc152)
105
+ scf.yield %_tmp46_31 : tensor<1x2048xi64> loc(#loc63)
106
+ } loc(#loc101)
107
+ %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({
108
+ ^bb0(%tmp46_9: i64 loc(callsite(#loc1 at #loc153)), %tmp46_10: i64 loc(callsite(#loc1 at #loc153))):
109
+ %tmp46_11 = arith.addi %tmp46_9, %tmp46_10 : i64 loc(#loc171)
110
+ tt.reduce.return %tmp46_11 : i64 loc(#loc167)
111
+ }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc167)
112
+ %tmp46_8 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc154)
113
+ %tmp49 = arith.cmpi sgt, %tmp46_8, %cst : tensor<1x1xi64> loc(#loc155)
114
+ %tmp51 = arith.cmpi slt, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc156)
115
+ %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc157)
116
+ %tmp54 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc169)
117
+ %tmp55 = arith.cmpi eq, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc160)
118
+ %tmp57 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc170)
119
+ %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc76)
120
+ %1 = tt.splat %0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc76)
121
+ tt.store %1, %tmp54, %xmask_4 : tensor<1x1x!tt.ptr<i32>> loc(#loc77)
122
+ %2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr<i32>, i32 loc(#loc78)
123
+ %3 = tt.splat %2 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc78)
124
+ tt.store %3, %tmp57, %xmask_4 : tensor<1x1x!tt.ptr<i32>> loc(#loc79)
125
+ tt.return loc(#loc80)
126
+ } loc(#loc)
127
+ } loc(#loc)
128
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40)
129
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":90:35)
130
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28)
131
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21)
132
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:27)
133
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37)
134
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21)
135
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28)
136
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19)
137
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19)
138
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31)
139
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29)
140
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27)
141
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27)
142
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26)
143
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22)
144
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22)
145
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26)
146
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22)
147
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22)
148
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22)
149
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23)
150
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35)
151
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87)
152
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94)
153
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77)
154
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23)
155
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23)
156
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24)
157
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23)
158
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37)
159
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24)
160
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24)
161
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25)
162
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92)
163
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92)
164
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25)
165
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24)
166
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24)
167
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39)
168
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24)
169
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24)
170
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24)
171
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45)
172
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38)
173
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51)
174
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25)
175
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25)
176
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92)
177
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25)
178
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24)
179
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24)
180
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39)
181
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25)
182
+ #loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24)
183
+ #loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24)
184
+ #loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38)
185
+ #loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25)
186
+ #loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25)
187
+ #loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36)
188
+ #loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50)
189
+ #loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8)
190
+ #loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
191
+ #loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
192
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30)
193
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20)
194
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20)
195
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20)
196
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21)
197
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21)
198
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21)
199
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21)
200
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21)
201
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25)
202
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37)
203
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25)
204
+ #loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37)
205
+ #loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4)
206
+ #loc92 = loc("tmp50"(#loc3))
207
+ #loc93 = loc("xoffset"(#loc4))
208
+ #loc94 = loc("xmask"(#loc5))
209
+ #loc95 = loc("r0_base"(#loc6))
210
+ #loc96 = loc("r0_base"(#loc7))
211
+ #loc97 = loc("x1"(#loc8))
212
+ #loc98 = loc("x1"(#loc9))
213
+ #loc99 = loc("x0"(#loc10))
214
+ #loc100 = loc("x2"(#loc11))
215
+ #loc101 = loc("_tmp46"(#loc2))
216
+ #loc102 = loc("r0_index"(#loc12))
217
+ #loc103 = loc("r0_mask"(#loc13))
218
+ #loc104 = loc("r0_4"(#loc14))
219
+ #loc105 = loc("r0_3"(#loc15))
220
+ #loc106 = loc("tmp0"(#loc16))
221
+ #loc107 = loc("tmp0"(#loc17))
222
+ #loc108 = loc("tmp2"(#loc18))
223
+ #loc109 = loc("tmp3"(#loc19))
224
+ #loc110 = loc("tmp3"(#loc20))
225
+ #loc111 = loc("tmp5"(#loc21))
226
+ #loc112 = loc("tmp6"(#loc22))
227
+ #loc113 = loc("tmp9"(#loc23))
228
+ #loc114 = loc("tmp10"(#loc24))
229
+ #loc115 = loc("tmp10"(#loc25))
230
+ #loc116 = loc("tmp10"(#loc26))
231
+ #loc117 = loc("tmp10"(#loc27))
232
+ #loc118 = loc("tmp11"(#loc28))
233
+ #loc119 = loc("tmp12"(#loc29))
234
+ #loc120 = loc("tmp13"(#loc30))
235
+ #loc121 = loc("tmp14"(#loc31))
236
+ #loc122 = loc("tmp17"(#loc32))
237
+ #loc123 = loc("tmp18"(#loc33))
238
+ #loc124 = loc("tmp19"(#loc34))
239
+ #loc125 = loc("tmp21"(#loc35))
240
+ #loc126 = loc("tmp22"(#loc36))
241
+ #loc127 = loc("tmp23"(#loc37))
242
+ #loc128 = loc("tmp24"(#loc38))
243
+ #loc129 = loc("tmp25"(#loc39))
244
+ #loc130 = loc("tmp26"(#loc40))
245
+ #loc131 = loc("tmp27"(#loc41))
246
+ #loc132 = loc("tmp28"(#loc42))
247
+ #loc133 = loc("tmp29"(#loc43))
248
+ #loc134 = loc("tmp30"(#loc44))
249
+ #loc135 = loc("tmp30"(#loc45))
250
+ #loc136 = loc("tmp30"(#loc46))
251
+ #loc137 = loc("tmp30"(#loc47))
252
+ #loc138 = loc("tmp31"(#loc48))
253
+ #loc139 = loc("tmp32"(#loc49))
254
+ #loc140 = loc("tmp33"(#loc50))
255
+ #loc141 = loc("tmp34"(#loc51))
256
+ #loc142 = loc("tmp35"(#loc52))
257
+ #loc143 = loc("tmp36"(#loc53))
258
+ #loc144 = loc("tmp37"(#loc54))
259
+ #loc145 = loc("tmp39"(#loc55))
260
+ #loc146 = loc("tmp40"(#loc56))
261
+ #loc147 = loc("tmp41"(#loc57))
262
+ #loc148 = loc("tmp43"(#loc58))
263
+ #loc149 = loc("tmp44"(#loc59))
264
+ #loc150 = loc("tmp47"(#loc60))
265
+ #loc151 = loc("_tmp46"(#loc61))
266
+ #loc152 = loc("_tmp46"(#loc62))
267
+ #loc154 = loc("tmp46"(#loc67))
268
+ #loc155 = loc("tmp49"(#loc68))
269
+ #loc156 = loc("tmp51"(#loc69))
270
+ #loc157 = loc("tmp52"(#loc70))
271
+ #loc158 = loc("tmp54"(#loc71))
272
+ #loc159 = loc("tmp53"(#loc72))
273
+ #loc160 = loc("tmp55"(#loc73))
274
+ #loc161 = loc("tmp57"(#loc74))
275
+ #loc162 = loc("tmp56"(#loc75))
276
+ #loc163 = loc(fused[#loc107, #loc106])
277
+ #loc164 = loc(fused[#loc110, #loc109])
278
+ #loc165 = loc(fused[#loc116, #loc94])
279
+ #loc166 = loc(fused[#loc136, #loc135])
280
+ #loc167 = loc(callsite(#loc64 at #loc153))
281
+ #loc169 = loc(fused[#loc158, #loc159])
282
+ #loc170 = loc(fused[#loc161, #loc162])
283
+ #loc171 = loc(callsite(#loc66 at #loc167))
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}}
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin ADDED
Binary file (43.9 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "dcf9c3d58e1d47ce5613d5d0ceadd19ad1e3325e6f9b526d3b2abd496024081a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 512, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"}
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
9
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %9 = shl i32 %8, 3, !dbg !8
11
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %11 = and i32 %10, 24, !dbg !9
13
+ %12 = lshr i32 %10, 5, !dbg !9
14
+ %13 = and i32 %10, 7, !dbg !9
15
+ %14 = lshr i32 %10, 3, !dbg !9
16
+ %15 = and i32 %14, 7, !dbg !9
17
+ %16 = or disjoint i32 %9, %13, !dbg !10
18
+ %17 = or disjoint i32 %15, %9, !dbg !10
19
+ %18 = icmp slt i32 %16, 32, !dbg !11
20
+ %19 = icmp slt i32 %17, 32, !dbg !11
21
+ %20 = or disjoint i32 %15, 8, !dbg !12
22
+ %21 = shl nuw nsw i32 %13, 1, !dbg !12
23
+ %22 = sdiv i32 %16, 16, !dbg !13
24
+ %23 = mul nuw nsw i32 %15, 17, !dbg !14
25
+ %24 = mul nuw nsw i32 %20, 17, !dbg !14
26
+ %25 = shl i32 %22, 8, !dbg !15
27
+ %26 = add i32 %25, %16, !dbg !15
28
+ %27 = add i32 %26, %23, !dbg !16
29
+ %28 = add i32 %26, %24, !dbg !16
30
+ %29 = sext i32 %27 to i64, !dbg !17
31
+ %30 = getelementptr i32, ptr addrspace(1) %0, i64 %29, !dbg !17
32
+ %31 = sext i32 %28 to i64, !dbg !17
33
+ %32 = getelementptr i32, ptr addrspace(1) %0, i64 %31, !dbg !17
34
+ %33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %30, i1 %18) #4, !dbg !18
35
+ %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %32, i1 %18) #4, !dbg !18
36
+ %35 = lshr i32 %10, 4, !dbg !19
37
+ %.lobit = and i32 %35, 1, !dbg !19
38
+ %36 = and i32 %10, 8, !dbg !19
39
+ %.not = icmp eq i32 %36, 0, !dbg !19
40
+ %.lobit1 = lshr exact i32 %36, 3, !dbg !19
41
+ %37 = and i32 %10, 32, !dbg !19
42
+ %.not3 = icmp eq i32 %37, 0, !dbg !19
43
+ %.lobit2 = lshr exact i32 %37, 5, !dbg !19
44
+ %38 = xor i32 %.lobit1, 1, !dbg !23
45
+ %39 = xor i32 %.lobit, 1, !dbg !23
46
+ %40 = xor i32 %.lobit2, 1, !dbg !23
47
+ %41 = mul nuw nsw i32 %33, %38, !dbg !24
48
+ %42 = mul nuw nsw i32 %34, %38, !dbg !24
49
+ %43 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 8, i32 31), !dbg !25
50
+ %44 = add i32 %43, %41, !dbg !28
51
+ %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %42, i32 8, i32 31), !dbg !25
52
+ %46 = add i32 %45, %42, !dbg !28
53
+ %47 = mul nuw nsw i32 %33, %.lobit1, !dbg !29
54
+ %48 = mul nuw nsw i32 %34, %.lobit1, !dbg !29
55
+ %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !25
56
+ %50 = add i32 %49, %47, !dbg !28
57
+ %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !25
58
+ %52 = add i32 %51, %48, !dbg !28
59
+ %53 = mul nuw nsw i32 %38, %15, !dbg !30
60
+ %54 = mul nuw nsw i32 %20, %38, !dbg !30
61
+ %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 8, i32 31), !dbg !25
62
+ %56 = add i32 %55, %53, !dbg !28
63
+ %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 8, i32 31), !dbg !25
64
+ %58 = add i32 %57, %54, !dbg !28
65
+ %59 = mul nuw nsw i32 %15, %.lobit1, !dbg !31
66
+ %60 = mul nuw nsw i32 %20, %.lobit1, !dbg !31
67
+ %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !25
68
+ %62 = add i32 %61, %59, !dbg !28
69
+ %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !25
70
+ %64 = add i32 %63, %60, !dbg !28
71
+ %65 = trunc i32 %35 to i1, !dbg !32
72
+ %66 = icmp sge i32 %44, %50, !dbg !32
73
+ %67 = icmp ne i32 %44, %50, !dbg !32
74
+ %68 = icmp sle i32 %56, %62, !dbg !32
75
+ %69 = or i1 %67, %68, !dbg !32
76
+ %70 = and i1 %66, %69, !dbg !32
77
+ %.not4 = xor i1 %70, %65, !dbg !32
78
+ %71 = icmp sge i32 %46, %52, !dbg !32
79
+ %72 = icmp ne i32 %46, %52, !dbg !32
80
+ %73 = icmp sle i32 %58, %64, !dbg !32
81
+ %74 = or i1 %72, %73, !dbg !32
82
+ %75 = and i1 %71, %74, !dbg !32
83
+ %.not5 = xor i1 %75, %65, !dbg !32
84
+ %76 = xor i32 %50, %44, !dbg !33
85
+ %77 = xor i32 %52, %46, !dbg !33
86
+ %78 = select i1 %.not4, i32 0, i32 %76, !dbg !34
87
+ %79 = select i1 %.not5, i32 0, i32 %77, !dbg !34
88
+ %80 = xor i32 %78, %33, !dbg !35
89
+ %81 = xor i32 %79, %34, !dbg !35
90
+ %82 = xor i32 %62, %56, !dbg !36
91
+ %83 = xor i32 %64, %58, !dbg !36
92
+ %84 = select i1 %.not4, i32 0, i32 %82, !dbg !37
93
+ %85 = select i1 %.not5, i32 0, i32 %83, !dbg !37
94
+ %86 = xor i32 %84, %15, !dbg !38
95
+ %87 = xor i32 %85, %20, !dbg !38
96
+ %88 = mul nuw nsw i32 %80, %39, !dbg !24
97
+ %89 = mul nuw nsw i32 %81, %39, !dbg !24
98
+ %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !25
99
+ %91 = add i32 %88, %90, !dbg !28
100
+ %92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 16, i32 31), !dbg !25
101
+ %93 = add i32 %89, %92, !dbg !28
102
+ %94 = mul nuw nsw i32 %80, %.lobit, !dbg !29
103
+ %95 = mul nuw nsw i32 %81, %.lobit, !dbg !29
104
+ %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !25
105
+ %97 = add i32 %94, %96, !dbg !28
106
+ %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 16, i32 31), !dbg !25
107
+ %99 = add i32 %95, %98, !dbg !28
108
+ %100 = mul nuw nsw i32 %86, %39, !dbg !30
109
+ %101 = mul nuw nsw i32 %87, %39, !dbg !30
110
+ %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 16, i32 31), !dbg !25
111
+ %103 = add i32 %100, %102, !dbg !28
112
+ %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !25
113
+ %105 = add i32 %101, %104, !dbg !28
114
+ %106 = mul nuw nsw i32 %86, %.lobit, !dbg !31
115
+ %107 = mul nuw nsw i32 %87, %.lobit, !dbg !31
116
+ %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !25
117
+ %109 = add i32 %106, %108, !dbg !28
118
+ %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 16, i32 31), !dbg !25
119
+ %111 = add i32 %107, %110, !dbg !28
120
+ %112 = icmp slt i32 %91, %97, !dbg !39
121
+ %113 = icmp slt i32 %93, %99, !dbg !39
122
+ %114 = icmp eq i32 %91, %97, !dbg !40
123
+ %115 = icmp eq i32 %93, %99, !dbg !40
124
+ %116 = icmp sgt i32 %103, %109, !dbg !41
125
+ %117 = icmp sgt i32 %105, %111, !dbg !41
126
+ %118 = and i1 %114, %116, !dbg !42
127
+ %119 = and i1 %115, %117, !dbg !42
128
+ %120 = or i1 %112, %118, !dbg !43
129
+ %121 = or i1 %113, %119, !dbg !43
130
+ %122 = zext i1 %120 to i32, !dbg !44
131
+ %123 = zext i1 %121 to i32, !dbg !44
132
+ %.not6 = icmp eq i32 %.lobit2, %122, !dbg !32
133
+ %.not7 = icmp eq i32 %.lobit2, %123, !dbg !32
134
+ %124 = xor i32 %91, %97, !dbg !33
135
+ %125 = xor i32 %93, %99, !dbg !33
136
+ %126 = select i1 %.not6, i32 0, i32 %124, !dbg !34
137
+ %127 = select i1 %.not7, i32 0, i32 %125, !dbg !34
138
+ %128 = xor i32 %126, %80, !dbg !35
139
+ %129 = xor i32 %127, %81, !dbg !35
140
+ %130 = xor i32 %103, %109, !dbg !36
141
+ %131 = xor i32 %105, %111, !dbg !36
142
+ %132 = select i1 %.not6, i32 0, i32 %130, !dbg !37
143
+ %133 = select i1 %.not7, i32 0, i32 %131, !dbg !37
144
+ %134 = xor i32 %132, %86, !dbg !38
145
+ %135 = xor i32 %133, %87, !dbg !38
146
+ %136 = mul nuw nsw i32 %128, %38, !dbg !24
147
+ %137 = mul nuw nsw i32 %129, %38, !dbg !24
148
+ %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !25
149
+ %139 = add i32 %136, %138, !dbg !28
150
+ %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 8, i32 31), !dbg !25
151
+ %141 = add i32 %137, %140, !dbg !28
152
+ %142 = mul nuw nsw i32 %128, %.lobit1, !dbg !29
153
+ %143 = mul nuw nsw i32 %129, %.lobit1, !dbg !29
154
+ %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !25
155
+ %145 = add i32 %142, %144, !dbg !28
156
+ %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 8, i32 31), !dbg !25
157
+ %147 = add i32 %143, %146, !dbg !28
158
+ %148 = mul nuw nsw i32 %134, %38, !dbg !30
159
+ %149 = mul nuw nsw i32 %135, %38, !dbg !30
160
+ %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !25
161
+ %151 = add i32 %148, %150, !dbg !28
162
+ %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 8, i32 31), !dbg !25
163
+ %153 = add i32 %149, %152, !dbg !28
164
+ %154 = mul nuw nsw i32 %134, %.lobit1, !dbg !31
165
+ %155 = mul nuw nsw i32 %135, %.lobit1, !dbg !31
166
+ %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !25
167
+ %157 = add i32 %154, %156, !dbg !28
168
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 8, i32 31), !dbg !25
169
+ %159 = add i32 %155, %158, !dbg !28
170
+ %160 = icmp slt i32 %139, %145, !dbg !39
171
+ %161 = icmp slt i32 %141, %147, !dbg !39
172
+ %162 = icmp eq i32 %139, %145, !dbg !40
173
+ %163 = icmp eq i32 %141, %147, !dbg !40
174
+ %164 = icmp sgt i32 %151, %157, !dbg !41
175
+ %165 = icmp sgt i32 %153, %159, !dbg !41
176
+ %166 = and i1 %162, %164, !dbg !42
177
+ %167 = and i1 %163, %165, !dbg !42
178
+ %168 = or i1 %160, %166, !dbg !43
179
+ %169 = or i1 %161, %167, !dbg !43
180
+ %170 = zext i1 %168 to i32, !dbg !44
181
+ %171 = zext i1 %169 to i32, !dbg !44
182
+ %.not8 = icmp eq i32 %.lobit2, %170, !dbg !32
183
+ %.not9 = icmp eq i32 %.lobit2, %171, !dbg !32
184
+ %172 = xor i32 %139, %145, !dbg !33
185
+ %173 = xor i32 %141, %147, !dbg !33
186
+ %174 = select i1 %.not8, i32 0, i32 %172, !dbg !34
187
+ %175 = select i1 %.not9, i32 0, i32 %173, !dbg !34
188
+ %176 = xor i32 %174, %128, !dbg !35
189
+ %177 = xor i32 %175, %129, !dbg !35
190
+ %178 = xor i32 %151, %157, !dbg !36
191
+ %179 = xor i32 %153, %159, !dbg !36
192
+ %180 = select i1 %.not8, i32 0, i32 %178, !dbg !37
193
+ %181 = select i1 %.not9, i32 0, i32 %179, !dbg !37
194
+ %182 = xor i32 %180, %134, !dbg !38
195
+ %183 = xor i32 %181, %135, !dbg !38
196
+ %184 = mul nuw nsw i32 %176, %40, !dbg !24
197
+ %185 = mul nuw nsw i32 %177, %40, !dbg !24
198
+ %186 = and i32 %12, 1, !dbg !25
199
+ %187 = shl nuw nsw i32 %10, 1, !dbg !25
200
+ %188 = and i32 %187, 48, !dbg !25
201
+ %189 = or disjoint i32 %188, %21, !dbg !25
202
+ %.idx = shl nuw nsw i32 %189, 3, !dbg !25
203
+ %190 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !25
204
+ %191 = getelementptr i32, ptr addrspace(3) %190, i32 %186, !dbg !25
205
+ %192 = insertelement <1 x i32> poison, i32 %184, i64 0, !dbg !25
206
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %192, i1 true) #4, !dbg !25
207
+ %193 = getelementptr i8, ptr addrspace(3) %190, i32 8, !dbg !25
208
+ %194 = getelementptr i32, ptr addrspace(3) %193, i32 %186, !dbg !25
209
+ %195 = insertelement <1 x i32> poison, i32 %185, i64 0, !dbg !25
210
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %195, i1 true) #4, !dbg !25
211
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
212
+ %196 = icmp samesign ult i32 %10, 128, !dbg !25
213
+ %197 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %10, !dbg !25
214
+ %198 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
215
+ %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !25
216
+ %200 = add i32 %199, %198, !dbg !28
217
+ %201 = and i32 %10, 897, !dbg !25
218
+ %202 = icmp eq i32 %201, 0, !dbg !25
219
+ %203 = insertelement <1 x i32> poison, i32 %200, i64 0, !dbg !25
220
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %203, i1 %202) #4, !dbg !25
221
+ %204 = getelementptr i8, ptr addrspace(3) %197, i32 256, !dbg !25
222
+ %205 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
223
+ %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !25
224
+ %207 = add i32 %206, %205, !dbg !28
225
+ %208 = insertelement <1 x i32> poison, i32 %207, i64 0, !dbg !25
226
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %208, i1 %202) #4, !dbg !25
227
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
228
+ %209 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
229
+ %210 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
230
+ %211 = mul nuw nsw i32 %176, %.lobit2, !dbg !29
231
+ %212 = mul nuw nsw i32 %177, %.lobit2, !dbg !29
232
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
233
+ %213 = insertelement <1 x i32> poison, i32 %211, i64 0, !dbg !25
234
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %213, i1 true) #4, !dbg !25
235
+ %214 = insertelement <1 x i32> poison, i32 %212, i64 0, !dbg !25
236
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %214, i1 true) #4, !dbg !25
237
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
238
+ %215 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
239
+ %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 1, i32 31), !dbg !25
240
+ %217 = add i32 %216, %215, !dbg !28
241
+ %218 = insertelement <1 x i32> poison, i32 %217, i64 0, !dbg !25
242
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %218, i1 %202) #4, !dbg !25
243
+ %219 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
244
+ %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !25
245
+ %221 = add i32 %220, %219, !dbg !28
246
+ %222 = insertelement <1 x i32> poison, i32 %221, i64 0, !dbg !25
247
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %222, i1 %202) #4, !dbg !25
248
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
249
+ %223 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
250
+ %224 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
251
+ %225 = mul nuw nsw i32 %182, %40, !dbg !30
252
+ %226 = mul nuw nsw i32 %183, %40, !dbg !30
253
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
254
+ %227 = insertelement <1 x i32> poison, i32 %225, i64 0, !dbg !25
255
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %227, i1 true) #4, !dbg !25
256
+ %228 = insertelement <1 x i32> poison, i32 %226, i64 0, !dbg !25
257
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %228, i1 true) #4, !dbg !25
258
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
259
+ %229 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
260
+ %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !25
261
+ %231 = add i32 %230, %229, !dbg !28
262
+ %232 = insertelement <1 x i32> poison, i32 %231, i64 0, !dbg !25
263
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %232, i1 %202) #4, !dbg !25
264
+ %233 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
265
+ %234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %233, i32 1, i32 31), !dbg !25
266
+ %235 = add i32 %234, %233, !dbg !28
267
+ %236 = insertelement <1 x i32> poison, i32 %235, i64 0, !dbg !25
268
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %236, i1 %202) #4, !dbg !25
269
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
270
+ %237 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
271
+ %238 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
272
+ %239 = mul nuw nsw i32 %182, %.lobit2, !dbg !31
273
+ %240 = mul nuw nsw i32 %183, %.lobit2, !dbg !31
274
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
275
+ %241 = insertelement <1 x i32> poison, i32 %239, i64 0, !dbg !25
276
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %241, i1 true) #4, !dbg !25
277
+ %242 = insertelement <1 x i32> poison, i32 %240, i64 0, !dbg !25
278
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %242, i1 true) #4, !dbg !25
279
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
280
+ %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
281
+ %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !25
282
+ %245 = add i32 %244, %243, !dbg !28
283
+ %246 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !25
284
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %246, i1 %202) #4, !dbg !25
285
+ %247 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
286
+ %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 1, i32 31), !dbg !25
287
+ %249 = add i32 %248, %247, !dbg !28
288
+ %250 = insertelement <1 x i32> poison, i32 %249, i64 0, !dbg !25
289
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %250, i1 %202) #4, !dbg !25
290
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
291
+ %251 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
292
+ %252 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
293
+ %253 = icmp slt i32 %209, %223, !dbg !39
294
+ %254 = icmp sge i32 %210, %224, !dbg !39
295
+ %255 = icmp eq i32 %209, %223, !dbg !40
296
+ %256 = icmp ne i32 %210, %224, !dbg !40
297
+ %257 = icmp sgt i32 %237, %251, !dbg !41
298
+ %258 = icmp sle i32 %238, %252, !dbg !41
299
+ %259 = and i1 %255, %257, !dbg !42
300
+ %.not15 = or i1 %256, %258, !dbg !43
301
+ %260 = or i1 %253, %259, !dbg !43
302
+ %.not12 = and i1 %254, %.not15, !dbg !44
303
+ %261 = xor i32 %223, %209, !dbg !33
304
+ %262 = xor i32 %224, %210, !dbg !33
305
+ %263 = select i1 %260, i32 %261, i32 0, !dbg !34
306
+ %264 = select i1 %.not12, i32 %262, i32 0, !dbg !34
307
+ %265 = xor i32 %263, %176, !dbg !35
308
+ %266 = xor i32 %264, %177, !dbg !35
309
+ %267 = xor i32 %251, %237, !dbg !36
310
+ %268 = xor i32 %252, %238, !dbg !36
311
+ %269 = select i1 %260, i32 %267, i32 0, !dbg !37
312
+ %270 = select i1 %.not12, i32 %268, i32 0, !dbg !37
313
+ %271 = xor i32 %269, %182, !dbg !38
314
+ %272 = xor i32 %270, %183, !dbg !38
315
+ %273 = mul nuw nsw i32 %265, %39, !dbg !24
316
+ %274 = mul nuw nsw i32 %266, %39, !dbg !24
317
+ %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 16, i32 31), !dbg !25
318
+ %276 = add i32 %273, %275, !dbg !28
319
+ %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 16, i32 31), !dbg !25
320
+ %278 = add i32 %274, %277, !dbg !28
321
+ %279 = mul nuw nsw i32 %265, %.lobit, !dbg !29
322
+ %280 = mul nuw nsw i32 %266, %.lobit, !dbg !29
323
+ %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 16, i32 31), !dbg !25
324
+ %282 = add i32 %279, %281, !dbg !28
325
+ %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 16, i32 31), !dbg !25
326
+ %284 = add i32 %280, %283, !dbg !28
327
+ %285 = mul nuw nsw i32 %271, %39, !dbg !30
328
+ %286 = mul nuw nsw i32 %272, %39, !dbg !30
329
+ %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 16, i32 31), !dbg !25
330
+ %288 = add i32 %285, %287, !dbg !28
331
+ %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 16, i32 31), !dbg !25
332
+ %290 = add i32 %286, %289, !dbg !28
333
+ %291 = mul nuw nsw i32 %271, %.lobit, !dbg !31
334
+ %292 = mul nuw nsw i32 %272, %.lobit, !dbg !31
335
+ %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 16, i32 31), !dbg !25
336
+ %294 = add i32 %293, %291, !dbg !28
337
+ %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 16, i32 31), !dbg !25
338
+ %296 = add i32 %295, %292, !dbg !28
339
+ %297 = icmp slt i32 %276, %282, !dbg !39
340
+ %298 = icmp sge i32 %278, %284, !dbg !39
341
+ %299 = icmp eq i32 %276, %282, !dbg !40
342
+ %300 = icmp ne i32 %278, %284, !dbg !40
343
+ %301 = icmp sgt i32 %288, %294, !dbg !41
344
+ %302 = icmp sle i32 %290, %296, !dbg !41
345
+ %303 = and i1 %299, %301, !dbg !42
346
+ %.not21 = or i1 %300, %302, !dbg !43
347
+ %304 = or i1 %297, %303, !dbg !43
348
+ %.not18 = and i1 %298, %.not21, !dbg !44
349
+ %305 = xor i32 %276, %282, !dbg !33
350
+ %306 = xor i32 %278, %284, !dbg !33
351
+ %307 = select i1 %304, i32 %305, i32 0, !dbg !34
352
+ %308 = select i1 %.not18, i32 %306, i32 0, !dbg !34
353
+ %309 = xor i32 %307, %265, !dbg !35
354
+ %310 = xor i32 %308, %266, !dbg !35
355
+ %311 = xor i32 %294, %288, !dbg !36
356
+ %312 = xor i32 %296, %290, !dbg !36
357
+ %313 = select i1 %304, i32 %311, i32 0, !dbg !37
358
+ %314 = select i1 %.not18, i32 %312, i32 0, !dbg !37
359
+ %315 = xor i32 %313, %271, !dbg !38
360
+ %316 = xor i32 %314, %272, !dbg !38
361
+ %317 = mul nuw nsw i32 %309, %38, !dbg !24
362
+ %318 = mul nuw nsw i32 %310, %38, !dbg !24
363
+ %319 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 8, i32 31), !dbg !25
364
+ %320 = add i32 %317, %319, !dbg !28
365
+ %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %318, i32 8, i32 31), !dbg !25
366
+ %322 = add i32 %318, %321, !dbg !28
367
+ %323 = mul nuw nsw i32 %309, %.lobit1, !dbg !29
368
+ %324 = mul nuw nsw i32 %310, %.lobit1, !dbg !29
369
+ %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 8, i32 31), !dbg !25
370
+ %326 = add i32 %323, %325, !dbg !28
371
+ %327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 8, i32 31), !dbg !25
372
+ %328 = add i32 %324, %327, !dbg !28
373
+ %329 = mul nuw nsw i32 %315, %38, !dbg !30
374
+ %330 = mul nuw nsw i32 %316, %38, !dbg !30
375
+ %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 8, i32 31), !dbg !25
376
+ %332 = add i32 %329, %331, !dbg !28
377
+ %333 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 8, i32 31), !dbg !25
378
+ %334 = add i32 %330, %333, !dbg !28
379
+ %335 = mul nuw nsw i32 %315, %.lobit1, !dbg !31
380
+ %336 = mul nuw nsw i32 %316, %.lobit1, !dbg !31
381
+ %337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %335, i32 8, i32 31), !dbg !25
382
+ %338 = add i32 %337, %335, !dbg !28
383
+ %339 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 8, i32 31), !dbg !25
384
+ %340 = add i32 %339, %336, !dbg !28
385
+ %341 = icmp slt i32 %320, %326, !dbg !39
386
+ %342 = icmp sge i32 %322, %328, !dbg !39
387
+ %343 = icmp eq i32 %320, %326, !dbg !40
388
+ %344 = icmp ne i32 %322, %328, !dbg !40
389
+ %345 = icmp sgt i32 %332, %338, !dbg !41
390
+ %346 = icmp sle i32 %334, %340, !dbg !41
391
+ %347 = and i1 %343, %345, !dbg !42
392
+ %.not27 = or i1 %344, %346, !dbg !43
393
+ %348 = or i1 %341, %347, !dbg !43
394
+ %.not24 = and i1 %342, %.not27, !dbg !44
395
+ %349 = xor i32 %320, %326, !dbg !33
396
+ %350 = xor i32 %322, %328, !dbg !33
397
+ %351 = select i1 %348, i32 %349, i32 0, !dbg !34
398
+ %352 = select i1 %.not24, i32 %350, i32 0, !dbg !34
399
+ %353 = xor i32 %351, %309, !dbg !35
400
+ %354 = xor i32 %352, %310, !dbg !35
401
+ %355 = xor i32 %338, %332, !dbg !36
402
+ %356 = xor i32 %340, %334, !dbg !36
403
+ %357 = select i1 %348, i32 %355, i32 0, !dbg !37
404
+ %358 = select i1 %.not24, i32 %356, i32 0, !dbg !37
405
+ %359 = xor i32 %357, %315, !dbg !38
406
+ %360 = xor i32 %358, %316, !dbg !38
407
+ %361 = icmp slt i32 %353, %354, !dbg !39
408
+ %362 = icmp eq i32 %353, %354, !dbg !40
409
+ %363 = icmp sgt i32 %359, %360, !dbg !41
410
+ %364 = and i1 %362, %363, !dbg !42
411
+ %365 = or i1 %361, %364, !dbg !43
412
+ %366 = xor i32 %354, %353, !dbg !33
413
+ %367 = select i1 %365, i32 %366, i32 0, !dbg !34
414
+ %368 = xor i32 %367, %353, !dbg !35
415
+ %369 = xor i32 %367, %354, !dbg !35
416
+ %370 = xor i32 %360, %359, !dbg !36
417
+ %371 = select i1 %365, i32 %370, i32 0, !dbg !37
418
+ %372 = xor i32 %371, %359, !dbg !38
419
+ %373 = xor i32 %371, %360, !dbg !38
420
+ %374 = mul nuw nsw i32 %368, %40, !dbg !24
421
+ %375 = mul nuw nsw i32 %369, %40, !dbg !24
422
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
423
+ %376 = insertelement <1 x i32> poison, i32 %374, i64 0, !dbg !25
424
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %376, i1 true) #4, !dbg !25
425
+ %377 = insertelement <1 x i32> poison, i32 %375, i64 0, !dbg !25
426
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %377, i1 true) #4, !dbg !25
427
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
428
+ %378 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
429
+ %379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 1, i32 31), !dbg !25
430
+ %380 = add i32 %379, %378, !dbg !28
431
+ %381 = insertelement <1 x i32> poison, i32 %380, i64 0, !dbg !25
432
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %381, i1 %202) #4, !dbg !25
433
+ %382 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
434
+ %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !25
435
+ %384 = add i32 %383, %382, !dbg !28
436
+ %385 = insertelement <1 x i32> poison, i32 %384, i64 0, !dbg !25
437
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %385, i1 %202) #4, !dbg !25
438
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
439
+ %386 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
440
+ %387 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
441
+ %388 = mul nuw nsw i32 %368, %.lobit2, !dbg !29
442
+ %389 = mul nuw nsw i32 %369, %.lobit2, !dbg !29
443
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
444
+ %390 = insertelement <1 x i32> poison, i32 %388, i64 0, !dbg !25
445
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %390, i1 true) #4, !dbg !25
446
+ %391 = insertelement <1 x i32> poison, i32 %389, i64 0, !dbg !25
447
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %391, i1 true) #4, !dbg !25
448
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
449
+ %392 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
450
+ %393 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %392, i32 1, i32 31), !dbg !25
451
+ %394 = add i32 %393, %392, !dbg !28
452
+ %395 = insertelement <1 x i32> poison, i32 %394, i64 0, !dbg !25
453
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %395, i1 %202) #4, !dbg !25
454
+ %396 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
455
+ %397 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %396, i32 1, i32 31), !dbg !25
456
+ %398 = add i32 %397, %396, !dbg !28
457
+ %399 = insertelement <1 x i32> poison, i32 %398, i64 0, !dbg !25
458
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %399, i1 %202) #4, !dbg !25
459
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
460
+ %400 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
461
+ %401 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
462
+ %402 = mul nuw nsw i32 %372, %40, !dbg !30
463
+ %403 = mul nuw nsw i32 %373, %40, !dbg !30
464
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
465
+ %404 = insertelement <1 x i32> poison, i32 %402, i64 0, !dbg !25
466
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %404, i1 true) #4, !dbg !25
467
+ %405 = insertelement <1 x i32> poison, i32 %403, i64 0, !dbg !25
468
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %405, i1 true) #4, !dbg !25
469
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
470
+ %406 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
471
+ %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 1, i32 31), !dbg !25
472
+ %408 = add i32 %407, %406, !dbg !28
473
+ %409 = insertelement <1 x i32> poison, i32 %408, i64 0, !dbg !25
474
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %409, i1 %202) #4, !dbg !25
475
+ %410 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
476
+ %411 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %410, i32 1, i32 31), !dbg !25
477
+ %412 = add i32 %411, %410, !dbg !28
478
+ %413 = insertelement <1 x i32> poison, i32 %412, i64 0, !dbg !25
479
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %413, i1 %202) #4, !dbg !25
480
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
481
+ %414 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
482
+ %415 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
483
+ %416 = mul nuw nsw i32 %372, %.lobit2, !dbg !31
484
+ %417 = mul nuw nsw i32 %373, %.lobit2, !dbg !31
485
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
486
+ %418 = insertelement <1 x i32> poison, i32 %416, i64 0, !dbg !25
487
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %418, i1 true) #4, !dbg !25
488
+ %419 = insertelement <1 x i32> poison, i32 %417, i64 0, !dbg !25
489
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %419, i1 true) #4, !dbg !25
490
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
491
+ %420 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
492
+ %421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 1, i32 31), !dbg !25
493
+ %422 = add i32 %421, %420, !dbg !28
494
+ %423 = insertelement <1 x i32> poison, i32 %422, i64 0, !dbg !25
495
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %423, i1 %202) #4, !dbg !25
496
+ %424 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
497
+ %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !25
498
+ %426 = add i32 %425, %424, !dbg !28
499
+ %427 = insertelement <1 x i32> poison, i32 %426, i64 0, !dbg !25
500
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %427, i1 %202) #4, !dbg !25
501
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
502
+ %428 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
503
+ %429 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
504
+ %430 = icmp slt i32 %386, %400, !dbg !39
505
+ %431 = icmp slt i32 %387, %401, !dbg !39
506
+ %432 = icmp eq i32 %386, %400, !dbg !40
507
+ %433 = icmp eq i32 %387, %401, !dbg !40
508
+ %434 = icmp sgt i32 %414, %428, !dbg !41
509
+ %435 = icmp sgt i32 %415, %429, !dbg !41
510
+ %436 = and i1 %432, %434, !dbg !42
511
+ %437 = and i1 %433, %435, !dbg !42
512
+ %438 = or i1 %430, %436, !dbg !43
513
+ %439 = or i1 %431, %437, !dbg !43
514
+ %440 = xor i32 %400, %386, !dbg !33
515
+ %441 = xor i32 %401, %387, !dbg !33
516
+ %442 = select i1 %438, i32 %440, i32 0, !dbg !34
517
+ %443 = select i1 %439, i32 %441, i32 0, !dbg !34
518
+ %444 = xor i32 %442, %368, !dbg !35
519
+ %445 = xor i32 %443, %369, !dbg !35
520
+ %446 = xor i32 %428, %414, !dbg !36
521
+ %447 = xor i32 %429, %415, !dbg !36
522
+ %448 = select i1 %438, i32 %446, i32 0, !dbg !37
523
+ %449 = select i1 %439, i32 %447, i32 0, !dbg !37
524
+ %450 = xor i32 %448, %372, !dbg !38
525
+ %451 = xor i32 %449, %373, !dbg !38
526
+ %452 = mul nuw nsw i32 %444, %39, !dbg !24
527
+ %453 = mul nuw nsw i32 %445, %39, !dbg !24
528
+ %454 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 16, i32 31), !dbg !25
529
+ %455 = add i32 %452, %454, !dbg !28
530
+ %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %453, i32 16, i32 31), !dbg !25
531
+ %457 = add i32 %453, %456, !dbg !28
532
+ %458 = mul nuw nsw i32 %444, %.lobit, !dbg !29
533
+ %459 = mul nuw nsw i32 %445, %.lobit, !dbg !29
534
+ %460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 16, i32 31), !dbg !25
535
+ %461 = add i32 %458, %460, !dbg !28
536
+ %462 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %459, i32 16, i32 31), !dbg !25
537
+ %463 = add i32 %459, %462, !dbg !28
538
+ %464 = mul nuw nsw i32 %450, %39, !dbg !30
539
+ %465 = mul nuw nsw i32 %451, %39, !dbg !30
540
+ %466 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 16, i32 31), !dbg !25
541
+ %467 = add i32 %464, %466, !dbg !28
542
+ %468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %465, i32 16, i32 31), !dbg !25
543
+ %469 = add i32 %465, %468, !dbg !28
544
+ %470 = mul nuw nsw i32 %450, %.lobit, !dbg !31
545
+ %471 = mul nuw nsw i32 %451, %.lobit, !dbg !31
546
+ %472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 16, i32 31), !dbg !25
547
+ %473 = add i32 %472, %470, !dbg !28
548
+ %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 16, i32 31), !dbg !25
549
+ %475 = add i32 %474, %471, !dbg !28
550
+ %476 = icmp slt i32 %455, %461, !dbg !39
551
+ %477 = icmp slt i32 %457, %463, !dbg !39
552
+ %478 = icmp eq i32 %455, %461, !dbg !40
553
+ %479 = icmp eq i32 %457, %463, !dbg !40
554
+ %480 = icmp sgt i32 %467, %473, !dbg !41
555
+ %481 = icmp sgt i32 %469, %475, !dbg !41
556
+ %482 = and i1 %478, %480, !dbg !42
557
+ %483 = and i1 %479, %481, !dbg !42
558
+ %484 = or i1 %476, %482, !dbg !43
559
+ %485 = or i1 %477, %483, !dbg !43
560
+ %486 = xor i32 %455, %461, !dbg !33
561
+ %487 = xor i32 %457, %463, !dbg !33
562
+ %488 = select i1 %484, i32 %486, i32 0, !dbg !34
563
+ %489 = select i1 %485, i32 %487, i32 0, !dbg !34
564
+ %490 = xor i32 %488, %444, !dbg !35
565
+ %491 = xor i32 %489, %445, !dbg !35
566
+ %492 = xor i32 %473, %467, !dbg !36
567
+ %493 = xor i32 %475, %469, !dbg !36
568
+ %494 = select i1 %484, i32 %492, i32 0, !dbg !37
569
+ %495 = select i1 %485, i32 %493, i32 0, !dbg !37
570
+ %496 = xor i32 %494, %450, !dbg !38
571
+ %497 = xor i32 %495, %451, !dbg !38
572
+ %498 = mul nuw nsw i32 %490, %38, !dbg !24
573
+ %499 = mul nuw nsw i32 %491, %38, !dbg !24
574
+ %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 8, i32 31), !dbg !25
575
+ %501 = add i32 %498, %500, !dbg !28
576
+ %502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 8, i32 31), !dbg !25
577
+ %503 = add i32 %499, %502, !dbg !28
578
+ %504 = mul nuw nsw i32 %490, %.lobit1, !dbg !29
579
+ %505 = mul nuw nsw i32 %491, %.lobit1, !dbg !29
580
+ %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 8, i32 31), !dbg !25
581
+ %507 = add i32 %504, %506, !dbg !28
582
+ %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %505, i32 8, i32 31), !dbg !25
583
+ %509 = add i32 %505, %508, !dbg !28
584
+ %510 = mul nuw nsw i32 %496, %38, !dbg !30
585
+ %511 = mul nuw nsw i32 %497, %38, !dbg !30
586
+ %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %510, i32 8, i32 31), !dbg !25
587
+ %513 = add i32 %510, %512, !dbg !28
588
+ %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 8, i32 31), !dbg !25
589
+ %515 = add i32 %511, %514, !dbg !28
590
+ %516 = mul nuw nsw i32 %496, %.lobit1, !dbg !31
591
+ %517 = mul nuw nsw i32 %497, %.lobit1, !dbg !31
592
+ %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 8, i32 31), !dbg !25
593
+ %519 = add i32 %518, %516, !dbg !28
594
+ %520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 8, i32 31), !dbg !25
595
+ %521 = add i32 %520, %517, !dbg !28
596
+ %522 = icmp slt i32 %501, %507, !dbg !39
597
+ %523 = icmp slt i32 %503, %509, !dbg !39
598
+ %524 = icmp eq i32 %501, %507, !dbg !40
599
+ %525 = icmp eq i32 %503, %509, !dbg !40
600
+ %526 = icmp sgt i32 %513, %519, !dbg !41
601
+ %527 = icmp sgt i32 %515, %521, !dbg !41
602
+ %528 = and i1 %524, %526, !dbg !42
603
+ %529 = and i1 %525, %527, !dbg !42
604
+ %530 = or i1 %522, %528, !dbg !43
605
+ %531 = or i1 %523, %529, !dbg !43
606
+ %532 = xor i32 %519, %513, !dbg !36
607
+ %533 = xor i32 %521, %515, !dbg !36
608
+ %534 = select i1 %530, i32 %532, i32 0, !dbg !37
609
+ %535 = select i1 %531, i32 %533, i32 0, !dbg !37
610
+ %536 = xor i32 %534, %496, !dbg !38
611
+ %537 = xor i32 %535, %497, !dbg !38
612
+ %narrow = select i1 %18, i32 %33, i32 0, !dbg !45
613
+ %538 = sext i32 %narrow to i64, !dbg !45
614
+ %narrow28 = select i1 %18, i32 %34, i32 0, !dbg !45
615
+ %539 = sext i32 %narrow28 to i64, !dbg !45
616
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
617
+ %540 = add nsw i64 %539, %538, !dbg !48
618
+ %extelt.offset = lshr i64 %540, 32, !dbg !46
619
+ %541 = trunc nuw i64 %extelt.offset to i32, !dbg !46
620
+ %542 = trunc i64 %540 to i32, !dbg !46
621
+ %543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 16, i32 31), !dbg !46
622
+ %544 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 16, i32 31), !dbg !46
623
+ %545 = insertelement <2 x i32> poison, i32 %543, i64 0, !dbg !46
624
+ %546 = insertelement <2 x i32> %545, i32 %544, i64 1, !dbg !46
625
+ %547 = bitcast <2 x i32> %546 to i64, !dbg !46
626
+ %548 = add i64 %540, %547, !dbg !48
627
+ %extelt.offset29 = lshr i64 %548, 32, !dbg !46
628
+ %549 = trunc nuw i64 %extelt.offset29 to i32, !dbg !46
629
+ %550 = trunc i64 %548 to i32, !dbg !46
630
+ %551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 8, i32 31), !dbg !46
631
+ %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 8, i32 31), !dbg !46
632
+ %553 = insertelement <2 x i32> poison, i32 %551, i64 0, !dbg !46
633
+ %554 = insertelement <2 x i32> %553, i32 %552, i64 1, !dbg !46
634
+ %555 = bitcast <2 x i32> %554 to i64, !dbg !46
635
+ %556 = add i64 %548, %555, !dbg !48
636
+ %557 = icmp eq i32 %11, 0, !dbg !46
637
+ %558 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %21, !dbg !46
638
+ %559 = getelementptr i64, ptr addrspace(3) %558, i32 %186, !dbg !46
639
+ %560 = insertelement <1 x i64> poison, i64 %556, i64 0, !dbg !46
640
+ tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %559, <1 x i64> %560, i1 %557) #4, !dbg !46
641
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
642
+ %561 = icmp samesign ult i32 %10, 16, !dbg !46
643
+ %562 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %10, !dbg !46
644
+ %563 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %562, i1 %561) #4, !dbg !46
645
+ %extelt.offset30 = lshr i64 %563, 32, !dbg !46
646
+ %564 = trunc nuw i64 %extelt.offset30 to i32, !dbg !46
647
+ %565 = trunc i64 %563 to i32, !dbg !46
648
+ %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 1, i32 31), !dbg !46
649
+ %567 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %564, i32 1, i32 31), !dbg !46
650
+ %568 = insertelement <2 x i32> poison, i32 %566, i64 0, !dbg !46
651
+ %569 = insertelement <2 x i32> %568, i32 %567, i64 1, !dbg !46
652
+ %570 = bitcast <2 x i32> %569 to i64, !dbg !46
653
+ %571 = add i64 %563, %570, !dbg !48
654
+ %572 = and i32 %10, 1009, !dbg !46
655
+ %573 = icmp eq i32 %572, 0, !dbg !46
656
+ %574 = insertelement <1 x i64> poison, i64 %571, i64 0, !dbg !46
657
+ tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %562, <1 x i64> %574, i1 %573) #4, !dbg !46
658
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
659
+ %575 = load i64, ptr addrspace(3) %558, align 16, !dbg !46
660
+ %576 = trunc i64 %575 to i32, !dbg !49
661
+ %577 = shl i32 %17, 4, !dbg !50
662
+ %578 = or disjoint i32 %577, %21, !dbg !51
663
+ %579 = sext i32 %578 to i64, !dbg !52
664
+ %580 = getelementptr i32, ptr addrspace(1) %1, i64 %579, !dbg !52
665
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
666
+ %581 = and i32 %10, 3, !dbg !53
667
+ %582 = shl nuw nsw i32 %581, 3, !dbg !53
668
+ %583 = and i32 %187, 96, !dbg !53
669
+ %584 = and i32 %10, 4, !dbg !53
670
+ %585 = icmp eq i32 %584, 0, !dbg !53
671
+ %586 = select i1 %585, i32 0, i32 192, !dbg !53
672
+ %587 = select i1 %.not, i32 0, i32 260, !dbg !53
673
+ %588 = or disjoint i32 %582, %583, !dbg !53
674
+ %589 = xor i32 %588, %586, !dbg !53
675
+ %590 = or disjoint i32 %589, %587, !dbg !53
676
+ %591 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %590, !dbg !53
677
+ %592 = insertelement <1 x i32> poison, i32 %536, i64 0, !dbg !53
678
+ store <1 x i32> %592, ptr addrspace(3) %591, align 4, !dbg !53
679
+ %593 = xor i32 %590, 4, !dbg !53
680
+ %594 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %593, !dbg !53
681
+ %595 = insertelement <1 x i32> poison, i32 %537, i64 0, !dbg !53
682
+ store <1 x i32> %595, ptr addrspace(3) %594, align 4, !dbg !53
683
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
684
+ %596 = shl nuw nsw i32 %581, 5, !dbg !53
685
+ %597 = and i32 %10, 28, !dbg !53
686
+ %598 = select i1 %.not3, i32 0, i32 192, !dbg !53
687
+ %599 = or disjoint i32 %596, %597, !dbg !53
688
+ %600 = xor i32 %599, %598, !dbg !53
689
+ %601 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %600, !dbg !53
690
+ %602 = load i32, ptr addrspace(3) %601, align 4, !dbg !53
691
+ %603 = xor i32 %600, 260, !dbg !53
692
+ %604 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %603, !dbg !53
693
+ %605 = load i32, ptr addrspace(3) %604, align 4, !dbg !53
694
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %602, i32 %605, ptr addrspace(1) %580, i1 %19) #4, !dbg !53
695
+ %606 = sext i32 %16 to i64, !dbg !54
696
+ %607 = getelementptr i32, ptr addrspace(1) %2, i64 %606, !dbg !54
697
+ %608 = and i32 %10, 56, !dbg !55
698
+ %609 = icmp eq i32 %608, 0, !dbg !55
699
+ %610 = and i1 %609, %18, !dbg !55
700
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %576, ptr addrspace(1) %607, i1 %610) #4, !dbg !55
701
+ ret void, !dbg !56
702
+ }
703
+
704
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
705
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
706
+
707
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
708
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
709
+
710
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
711
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
712
+
713
+ ; Function Attrs: convergent nocallback nounwind
714
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
715
+
716
+ attributes #0 = { nounwind "nvvm.reqntid"="64" }
717
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
718
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
719
+ attributes #3 = { convergent nocallback nounwind }
720
+ attributes #4 = { nounwind }
721
+
722
+ !llvm.dbg.cu = !{!0}
723
+ !llvm.module.flags = !{!2, !3}
724
+
725
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
726
+ !1 = !DIFile(filename: "chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx")
727
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
728
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
729
+ !4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
730
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
731
+ !6 = !{}
732
+ !7 = !DILocation(line: 24, column: 28, scope: !4)
733
+ !8 = !DILocation(line: 24, column: 33, scope: !4)
734
+ !9 = !DILocation(line: 25, column: 44, scope: !4)
735
+ !10 = !DILocation(line: 25, column: 23, scope: !4)
736
+ !11 = !DILocation(line: 26, column: 21, scope: !4)
737
+ !12 = !DILocation(line: 27, column: 38, scope: !4)
738
+ !13 = !DILocation(line: 34, column: 19, scope: !4)
739
+ !14 = !DILocation(line: 36, column: 38, scope: !4)
740
+ !15 = !DILocation(line: 36, column: 35, scope: !4)
741
+ !16 = !DILocation(line: 36, column: 45, scope: !4)
742
+ !17 = !DILocation(line: 36, column: 30, scope: !4)
743
+ !18 = !DILocation(line: 36, column: 54, scope: !4)
744
+ !19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22)
745
+ !20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0)
746
+ !21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
747
+ !22 = !DILocation(line: 41, column: 67, scope: !4)
748
+ !23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22)
749
+ !24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22)
750
+ !25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22)
751
+ !26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0)
752
+ !27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
753
+ !28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22)
754
+ !29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22)
755
+ !30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22)
756
+ !31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22)
757
+ !32 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22)
758
+ !33 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22)
759
+ !34 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22)
760
+ !35 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22)
761
+ !36 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22)
762
+ !37 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22)
763
+ !38 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22)
764
+ !39 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22)
765
+ !40 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22)
766
+ !41 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22)
767
+ !42 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22)
768
+ !43 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22)
769
+ !44 = !DILocation(line: 599, column: 19, scope: !20, inlinedAt: !22)
770
+ !45 = !DILocation(line: 44, column: 34, scope: !4)
771
+ !46 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !47)
772
+ !47 = !DILocation(line: 45, column: 26, scope: !4)
773
+ !48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !47)
774
+ !49 = !DILocation(line: 48, column: 21, scope: !4)
775
+ !50 = !DILocation(line: 49, column: 35, scope: !4)
776
+ !51 = !DILocation(line: 49, column: 32, scope: !4)
777
+ !52 = !DILocation(line: 49, column: 25, scope: !4)
778
+ !53 = !DILocation(line: 49, column: 47, scope: !4)
779
+ !54 = !DILocation(line: 50, column: 25, scope: !4)
780
+ !55 = !DILocation(line: 50, column: 37, scope: !4)
781
+ !56 = !DILocation(line: 50, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx ADDED
@@ -0,0 +1,1410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3
12
+ .visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(
13
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1,
15
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2,
16
+ .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3,
17
+ .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4,
18
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5,
19
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6
20
+ )
21
+ .reqntid 64
22
+ {
23
+ .reg .pred %p<155>;
24
+ .reg .b32 %r<504>;
25
+ .reg .b64 %rd<27>;
26
+ .loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0
27
+ $L__func_begin0:
28
+ .loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0
29
+
30
+ // %bb.0:
31
+ ld.param.b64 %rd8, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0];
32
+ ld.param.b64 %rd9, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1];
33
+ $L__tmp0:
34
+ .loc 1 24 28 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:24:28
35
+ mov.u32 %r105, %ctaid.x;
36
+ .loc 1 24 33 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:24:33
37
+ shl.b32 %r106, %r105, 3;
38
+ ld.param.b64 %rd10, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2];
39
+ .loc 1 25 44 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:25:44
40
+ mov.u32 %r107, %tid.x;
41
+ and.b32 %r108, %r107, 24;
42
+ and.b32 %r109, %r107, 7;
43
+ bfe.u32 %r110, %r107, 3, 3;
44
+ .loc 1 25 23 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:25:23
45
+ or.b32 %r111, %r106, %r109;
46
+ or.b32 %r112, %r110, %r106;
47
+ .loc 1 26 21 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:26:21
48
+ setp.lt.s32 %p1, %r111, 32;
49
+ setp.lt.s32 %p54, %r112, 32;
50
+ .loc 1 27 38 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:27:38
51
+ or.b32 %r113, %r110, 8;
52
+ shl.b32 %r114, %r109, 1;
53
+ .loc 1 34 19 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:34:19
54
+ bfe.s32 %r115, %r105, 28, 1;
55
+ shr.u32 %r116, %r115, 28;
56
+ add.s32 %r117, %r111, %r116;
57
+ .loc 1 36 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:35
58
+ shl.b32 %r118, %r117, 4;
59
+ and.b32 %r119, %r118, -256;
60
+ add.s32 %r120, %r119, %r111;
61
+ .loc 1 36 45 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:45
62
+ mad.lo.s32 %r121, %r110, 17, %r120;
63
+ add.s32 %r122, %r121, 136;
64
+ .loc 1 36 30 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:30
65
+ mad.wide.s32 %rd1, %r121, 4, %rd8;
66
+ mad.wide.s32 %rd2, %r122, 4, %rd8;
67
+ .loc 1 36 54 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:54
68
+ // begin inline asm
69
+ mov.u32 %r1, 0x0;
70
+ @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
71
+ // end inline asm
72
+ // begin inline asm
73
+ mov.u32 %r2, 0x0;
74
+ @%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ];
75
+ // end inline asm
76
+ $L__tmp1:
77
+ .loc 2 627 44 // triton_helpers.py:627:44 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
78
+ shr.u32 %r123, %r107, 4;
79
+ bfe.u32 %r124, %r107, 4, 1;
80
+ bfe.s32 %r125, %r107, 3, 1;
81
+ and.b32 %r126, %r107, 8;
82
+ bfe.u32 %r127, %r107, 3, 1;
83
+ bfe.s32 %r128, %r107, 5, 1;
84
+ bfe.u32 %r129, %r107, 5, 1;
85
+ .loc 2 537 21 // triton_helpers.py:537:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
86
+ xor.b32 %r130, %r127, 1;
87
+ xor.b32 %r131, %r124, 1;
88
+ xor.b32 %r132, %r129, 1;
89
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
90
+ mul.lo.s32 %r133, %r1, %r130;
91
+ mul.lo.s32 %r134, %r2, %r130;
92
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
93
+ shfl.sync.bfly.b32 %r135, %r133, 8, 31, -1;
94
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
95
+ add.s32 %r136, %r135, %r133;
96
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
97
+ shfl.sync.bfly.b32 %r137, %r134, 8, 31, -1;
98
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
99
+ add.s32 %r138, %r137, %r134;
100
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
101
+ mul.lo.s32 %r139, %r1, %r127;
102
+ mul.lo.s32 %r140, %r2, %r127;
103
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
104
+ shfl.sync.bfly.b32 %r141, %r139, 8, 31, -1;
105
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
106
+ add.s32 %r142, %r141, %r139;
107
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
108
+ shfl.sync.bfly.b32 %r143, %r140, 8, 31, -1;
109
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
110
+ add.s32 %r144, %r143, %r140;
111
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
112
+ mul.lo.s32 %r145, %r130, %r110;
113
+ shl.b32 %r146, %r130, 3;
114
+ or.b32 %r147, %r145, %r146;
115
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
116
+ shfl.sync.bfly.b32 %r148, %r145, 8, 31, -1;
117
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
118
+ add.s32 %r149, %r148, %r145;
119
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
120
+ shfl.sync.bfly.b32 %r150, %r147, 8, 31, -1;
121
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
122
+ add.s32 %r151, %r150, %r147;
123
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
124
+ mul.lo.s32 %r152, %r110, %r127;
125
+ or.b32 %r153, %r152, %r126;
126
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
127
+ shfl.sync.bfly.b32 %r154, %r152, 8, 31, -1;
128
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
129
+ add.s32 %r155, %r154, %r152;
130
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
131
+ shfl.sync.bfly.b32 %r156, %r153, 8, 31, -1;
132
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
133
+ add.s32 %r157, %r156, %r153;
134
+ .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
135
+ and.b32 %r158, %r123, 1;
136
+ setp.ne.b32 %p56, %r158, 0;
137
+ setp.ge.s32 %p57, %r136, %r142;
138
+ setp.ne.b32 %p58, %r136, %r142;
139
+ setp.le.s32 %p59, %r149, %r155;
140
+ or.pred %p60, %p58, %p59;
141
+ and.pred %p61, %p57, %p60;
142
+ xor.pred %p62, %p61, %p56;
143
+ setp.ge.s32 %p63, %r138, %r144;
144
+ setp.ne.b32 %p64, %r138, %r144;
145
+ setp.le.s32 %p65, %r151, %r157;
146
+ or.pred %p66, %p64, %p65;
147
+ and.pred %p67, %p63, %p66;
148
+ xor.pred %p68, %p67, %p56;
149
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
150
+ xor.b32 %r159, %r142, %r136;
151
+ xor.b32 %r160, %r144, %r138;
152
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
153
+ selp.b32 %r161, 0, %r159, %p62;
154
+ selp.b32 %r162, 0, %r160, %p68;
155
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
156
+ xor.b32 %r163, %r161, %r1;
157
+ xor.b32 %r164, %r162, %r2;
158
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
159
+ xor.b32 %r165, %r155, %r149;
160
+ xor.b32 %r166, %r157, %r151;
161
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
162
+ selp.b32 %r167, 0, %r165, %p62;
163
+ selp.b32 %r168, 0, %r166, %p68;
164
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
165
+ xor.b32 %r169, %r167, %r110;
166
+ xor.b32 %r170, %r168, %r113;
167
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
168
+ mul.lo.s32 %r171, %r163, %r131;
169
+ mul.lo.s32 %r172, %r164, %r131;
170
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
171
+ shfl.sync.bfly.b32 %r173, %r171, 16, 31, -1;
172
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
173
+ add.s32 %r174, %r171, %r173;
174
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
175
+ shfl.sync.bfly.b32 %r175, %r172, 16, 31, -1;
176
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
177
+ add.s32 %r176, %r172, %r175;
178
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
179
+ mul.lo.s32 %r177, %r163, %r124;
180
+ mul.lo.s32 %r178, %r164, %r124;
181
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
182
+ shfl.sync.bfly.b32 %r179, %r177, 16, 31, -1;
183
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
184
+ add.s32 %r180, %r177, %r179;
185
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
186
+ shfl.sync.bfly.b32 %r181, %r178, 16, 31, -1;
187
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
188
+ add.s32 %r182, %r178, %r181;
189
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
190
+ mul.lo.s32 %r183, %r169, %r131;
191
+ mul.lo.s32 %r184, %r170, %r131;
192
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
193
+ shfl.sync.bfly.b32 %r185, %r183, 16, 31, -1;
194
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
195
+ add.s32 %r186, %r183, %r185;
196
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
197
+ shfl.sync.bfly.b32 %r187, %r184, 16, 31, -1;
198
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
199
+ add.s32 %r188, %r184, %r187;
200
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
201
+ mul.lo.s32 %r189, %r169, %r124;
202
+ mul.lo.s32 %r190, %r170, %r124;
203
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
204
+ shfl.sync.bfly.b32 %r191, %r189, 16, 31, -1;
205
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
206
+ add.s32 %r192, %r189, %r191;
207
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
208
+ shfl.sync.bfly.b32 %r193, %r190, 16, 31, -1;
209
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
210
+ add.s32 %r194, %r190, %r193;
211
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
212
+ setp.lt.s32 %p69, %r174, %r180;
213
+ setp.lt.s32 %p70, %r176, %r182;
214
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
215
+ setp.eq.b32 %p71, %r174, %r180;
216
+ setp.eq.b32 %p72, %r176, %r182;
217
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
218
+ setp.gt.s32 %p73, %r186, %r192;
219
+ setp.gt.s32 %p74, %r188, %r194;
220
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
221
+ and.pred %p75, %p71, %p73;
222
+ and.pred %p76, %p72, %p74;
223
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
224
+ or.pred %p77, %p69, %p75;
225
+ or.pred %p78, %p70, %p76;
226
+ .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
227
+ selp.b32 %r195, 1, 0, %p77;
228
+ selp.b32 %r196, 1, 0, %p78;
229
+ .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
230
+ setp.eq.b32 %p79, %r129, %r195;
231
+ setp.eq.b32 %p80, %r129, %r196;
232
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
233
+ xor.b32 %r197, %r174, %r180;
234
+ xor.b32 %r198, %r176, %r182;
235
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
236
+ selp.b32 %r199, 0, %r197, %p79;
237
+ selp.b32 %r200, 0, %r198, %p80;
238
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
239
+ xor.b32 %r201, %r199, %r163;
240
+ xor.b32 %r202, %r200, %r164;
241
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
242
+ xor.b32 %r203, %r186, %r192;
243
+ xor.b32 %r204, %r188, %r194;
244
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
245
+ selp.b32 %r205, 0, %r203, %p79;
246
+ selp.b32 %r206, 0, %r204, %p80;
247
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
248
+ xor.b32 %r207, %r205, %r169;
249
+ xor.b32 %r208, %r206, %r170;
250
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
251
+ mul.lo.s32 %r209, %r201, %r130;
252
+ mul.lo.s32 %r210, %r202, %r130;
253
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
254
+ shfl.sync.bfly.b32 %r211, %r209, 8, 31, -1;
255
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
256
+ add.s32 %r212, %r209, %r211;
257
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
258
+ shfl.sync.bfly.b32 %r213, %r210, 8, 31, -1;
259
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
260
+ add.s32 %r214, %r210, %r213;
261
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
262
+ mul.lo.s32 %r215, %r201, %r127;
263
+ mul.lo.s32 %r216, %r202, %r127;
264
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
265
+ shfl.sync.bfly.b32 %r217, %r215, 8, 31, -1;
266
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
267
+ add.s32 %r218, %r215, %r217;
268
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
269
+ shfl.sync.bfly.b32 %r219, %r216, 8, 31, -1;
270
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
271
+ add.s32 %r220, %r216, %r219;
272
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
273
+ mul.lo.s32 %r221, %r207, %r130;
274
+ mul.lo.s32 %r222, %r208, %r130;
275
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
276
+ shfl.sync.bfly.b32 %r223, %r221, 8, 31, -1;
277
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
278
+ add.s32 %r224, %r221, %r223;
279
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
280
+ shfl.sync.bfly.b32 %r225, %r222, 8, 31, -1;
281
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
282
+ add.s32 %r226, %r222, %r225;
283
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
284
+ mul.lo.s32 %r227, %r207, %r127;
285
+ mul.lo.s32 %r228, %r208, %r127;
286
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
287
+ shfl.sync.bfly.b32 %r229, %r227, 8, 31, -1;
288
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
289
+ add.s32 %r230, %r227, %r229;
290
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
291
+ shfl.sync.bfly.b32 %r231, %r228, 8, 31, -1;
292
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
293
+ add.s32 %r232, %r228, %r231;
294
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
295
+ setp.lt.s32 %p81, %r212, %r218;
296
+ setp.lt.s32 %p82, %r214, %r220;
297
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
298
+ setp.eq.b32 %p83, %r212, %r218;
299
+ setp.eq.b32 %p84, %r214, %r220;
300
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
301
+ setp.gt.s32 %p85, %r224, %r230;
302
+ setp.gt.s32 %p86, %r226, %r232;
303
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
304
+ and.pred %p87, %p83, %p85;
305
+ and.pred %p88, %p84, %p86;
306
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
307
+ or.pred %p89, %p81, %p87;
308
+ or.pred %p90, %p82, %p88;
309
+ .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
310
+ selp.b32 %r233, 1, 0, %p89;
311
+ selp.b32 %r234, 1, 0, %p90;
312
+ .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
313
+ setp.eq.b32 %p91, %r129, %r233;
314
+ setp.eq.b32 %p92, %r129, %r234;
315
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
316
+ xor.b32 %r235, %r212, %r218;
317
+ xor.b32 %r236, %r214, %r220;
318
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
319
+ selp.b32 %r237, 0, %r235, %p91;
320
+ selp.b32 %r238, 0, %r236, %p92;
321
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
322
+ xor.b32 %r239, %r237, %r201;
323
+ xor.b32 %r240, %r238, %r202;
324
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
325
+ xor.b32 %r241, %r224, %r230;
326
+ xor.b32 %r242, %r226, %r232;
327
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
328
+ selp.b32 %r243, 0, %r241, %p91;
329
+ selp.b32 %r244, 0, %r242, %p92;
330
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
331
+ xor.b32 %r245, %r243, %r207;
332
+ xor.b32 %r246, %r244, %r208;
333
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
334
+ mul.lo.s32 %r4, %r239, %r132;
335
+ mul.lo.s32 %r6, %r240, %r132;
336
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
337
+ shl.b32 %r247, %r107, 1;
338
+ and.b32 %r248, %r247, 48;
339
+ or.b32 %r249, %r248, %r114;
340
+ shl.b32 %r250, %r249, 3;
341
+ mov.b32 %r251, global_smem;
342
+ add.s32 %r252, %r251, %r250;
343
+ shl.b32 %r253, %r129, 2;
344
+ add.s32 %r3, %r252, %r253;
345
+ mov.pred %p3, -1;
346
+ // begin inline asm
347
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r4;
348
+ // end inline asm
349
+ add.s32 %r5, %r3, 8;
350
+ // begin inline asm
351
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r6;
352
+ // end inline asm
353
+ bar.sync 0;
354
+ setp.lt.u32 %p5, %r107, 128;
355
+ shl.b32 %r254, %r107, 2;
356
+ add.s32 %r8, %r251, %r254;
357
+ // begin inline asm
358
+ @%p5 ld.shared.b32 %r7, [ %r8 + 0 ];
359
+ // end inline asm
360
+ shfl.sync.bfly.b32 %r255, %r7, 1, 31, -1;
361
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
362
+ add.s32 %r10, %r255, %r7;
363
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
364
+ and.b32 %r256, %r107, 897;
365
+ setp.eq.b32 %p6, %r256, 0;
366
+ // begin inline asm
367
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r10;
368
+ // end inline asm
369
+ add.s32 %r12, %r8, 256;
370
+ // begin inline asm
371
+ @%p5 ld.shared.b32 %r11, [ %r12 + 0 ];
372
+ // end inline asm
373
+ shfl.sync.bfly.b32 %r257, %r11, 1, 31, -1;
374
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
375
+ add.s32 %r14, %r257, %r11;
376
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
377
+ // begin inline asm
378
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r14;
379
+ // end inline asm
380
+ bar.sync 0;
381
+ ld.shared.b32 %r258, [%r252];
382
+ ld.shared.b32 %r259, [%r252+8];
383
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
384
+ mul.lo.s32 %r16, %r239, %r129;
385
+ mul.lo.s32 %r18, %r240, %r129;
386
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
387
+ bar.sync 0;
388
+ // begin inline asm
389
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r16;
390
+ // end inline asm
391
+ // begin inline asm
392
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r18;
393
+ // end inline asm
394
+ bar.sync 0;
395
+ // begin inline asm
396
+ @%p5 ld.shared.b32 %r19, [ %r8 + 0 ];
397
+ // end inline asm
398
+ shfl.sync.bfly.b32 %r260, %r19, 1, 31, -1;
399
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
400
+ add.s32 %r22, %r260, %r19;
401
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
402
+ // begin inline asm
403
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r22;
404
+ // end inline asm
405
+ // begin inline asm
406
+ @%p5 ld.shared.b32 %r23, [ %r12 + 0 ];
407
+ // end inline asm
408
+ shfl.sync.bfly.b32 %r261, %r23, 1, 31, -1;
409
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
410
+ add.s32 %r26, %r261, %r23;
411
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
412
+ // begin inline asm
413
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r26;
414
+ // end inline asm
415
+ bar.sync 0;
416
+ ld.shared.b32 %r262, [%r252];
417
+ ld.shared.b32 %r263, [%r252+8];
418
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
419
+ mul.lo.s32 %r28, %r245, %r132;
420
+ mul.lo.s32 %r30, %r246, %r132;
421
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
422
+ bar.sync 0;
423
+ // begin inline asm
424
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r28;
425
+ // end inline asm
426
+ // begin inline asm
427
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r30;
428
+ // end inline asm
429
+ bar.sync 0;
430
+ // begin inline asm
431
+ @%p5 ld.shared.b32 %r31, [ %r8 + 0 ];
432
+ // end inline asm
433
+ shfl.sync.bfly.b32 %r264, %r31, 1, 31, -1;
434
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
435
+ add.s32 %r34, %r264, %r31;
436
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
437
+ // begin inline asm
438
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r34;
439
+ // end inline asm
440
+ // begin inline asm
441
+ @%p5 ld.shared.b32 %r35, [ %r12 + 0 ];
442
+ // end inline asm
443
+ shfl.sync.bfly.b32 %r265, %r35, 1, 31, -1;
444
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
445
+ add.s32 %r38, %r265, %r35;
446
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
447
+ // begin inline asm
448
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r38;
449
+ // end inline asm
450
+ bar.sync 0;
451
+ ld.shared.b32 %r266, [%r252];
452
+ ld.shared.b32 %r267, [%r252+8];
453
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
454
+ mul.lo.s32 %r40, %r245, %r129;
455
+ mul.lo.s32 %r42, %r246, %r129;
456
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
457
+ bar.sync 0;
458
+ // begin inline asm
459
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r40;
460
+ // end inline asm
461
+ // begin inline asm
462
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r42;
463
+ // end inline asm
464
+ bar.sync 0;
465
+ // begin inline asm
466
+ @%p5 ld.shared.b32 %r43, [ %r8 + 0 ];
467
+ // end inline asm
468
+ shfl.sync.bfly.b32 %r268, %r43, 1, 31, -1;
469
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
470
+ add.s32 %r46, %r268, %r43;
471
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
472
+ // begin inline asm
473
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r46;
474
+ // end inline asm
475
+ // begin inline asm
476
+ @%p5 ld.shared.b32 %r47, [ %r12 + 0 ];
477
+ // end inline asm
478
+ shfl.sync.bfly.b32 %r269, %r47, 1, 31, -1;
479
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
480
+ add.s32 %r50, %r269, %r47;
481
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
482
+ // begin inline asm
483
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r50;
484
+ // end inline asm
485
+ bar.sync 0;
486
+ ld.shared.b32 %r270, [%r252];
487
+ ld.shared.b32 %r271, [%r252+8];
488
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
489
+ setp.lt.s32 %p93, %r258, %r262;
490
+ setp.ge.s32 %p94, %r259, %r263;
491
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
492
+ setp.eq.b32 %p95, %r258, %r262;
493
+ setp.ne.b32 %p96, %r259, %r263;
494
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
495
+ setp.gt.s32 %p97, %r266, %r270;
496
+ setp.le.s32 %p98, %r267, %r271;
497
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
498
+ and.pred %p99, %p95, %p97;
499
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
500
+ or.pred %p100, %p96, %p98;
501
+ or.pred %p101, %p93, %p99;
502
+ .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
503
+ and.pred %p102, %p94, %p100;
504
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
505
+ xor.b32 %r272, %r262, %r258;
506
+ xor.b32 %r273, %r263, %r259;
507
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
508
+ selp.b32 %r274, %r272, 0, %p101;
509
+ selp.b32 %r275, %r273, 0, %p102;
510
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
511
+ xor.b32 %r276, %r274, %r239;
512
+ xor.b32 %r277, %r275, %r240;
513
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
514
+ xor.b32 %r278, %r270, %r266;
515
+ xor.b32 %r279, %r271, %r267;
516
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
517
+ selp.b32 %r280, %r278, 0, %p101;
518
+ selp.b32 %r281, %r279, 0, %p102;
519
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
520
+ xor.b32 %r282, %r280, %r245;
521
+ xor.b32 %r283, %r281, %r246;
522
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
523
+ mul.lo.s32 %r284, %r276, %r131;
524
+ mul.lo.s32 %r285, %r277, %r131;
525
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
526
+ shfl.sync.bfly.b32 %r286, %r284, 16, 31, -1;
527
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
528
+ add.s32 %r287, %r284, %r286;
529
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
530
+ shfl.sync.bfly.b32 %r288, %r285, 16, 31, -1;
531
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
532
+ add.s32 %r289, %r285, %r288;
533
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
534
+ mul.lo.s32 %r290, %r276, %r124;
535
+ mul.lo.s32 %r291, %r277, %r124;
536
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
537
+ shfl.sync.bfly.b32 %r292, %r290, 16, 31, -1;
538
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
539
+ add.s32 %r293, %r290, %r292;
540
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
541
+ shfl.sync.bfly.b32 %r294, %r291, 16, 31, -1;
542
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
543
+ add.s32 %r295, %r291, %r294;
544
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
545
+ mul.lo.s32 %r296, %r282, %r131;
546
+ mul.lo.s32 %r297, %r283, %r131;
547
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
548
+ shfl.sync.bfly.b32 %r298, %r296, 16, 31, -1;
549
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
550
+ add.s32 %r299, %r296, %r298;
551
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
552
+ shfl.sync.bfly.b32 %r300, %r297, 16, 31, -1;
553
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
554
+ add.s32 %r301, %r297, %r300;
555
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
556
+ mul.lo.s32 %r302, %r282, %r124;
557
+ mul.lo.s32 %r303, %r283, %r124;
558
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
559
+ shfl.sync.bfly.b32 %r304, %r302, 16, 31, -1;
560
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
561
+ add.s32 %r305, %r304, %r302;
562
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
563
+ shfl.sync.bfly.b32 %r306, %r303, 16, 31, -1;
564
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
565
+ add.s32 %r307, %r306, %r303;
566
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
567
+ setp.lt.s32 %p103, %r287, %r293;
568
+ setp.ge.s32 %p104, %r289, %r295;
569
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
570
+ setp.eq.b32 %p105, %r287, %r293;
571
+ setp.ne.b32 %p106, %r289, %r295;
572
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
573
+ setp.gt.s32 %p107, %r299, %r305;
574
+ setp.le.s32 %p108, %r301, %r307;
575
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
576
+ and.pred %p109, %p105, %p107;
577
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
578
+ or.pred %p110, %p106, %p108;
579
+ or.pred %p111, %p103, %p109;
580
+ .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
581
+ and.pred %p112, %p104, %p110;
582
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
583
+ xor.b32 %r308, %r287, %r293;
584
+ xor.b32 %r309, %r289, %r295;
585
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
586
+ selp.b32 %r310, %r308, 0, %p111;
587
+ selp.b32 %r311, %r309, 0, %p112;
588
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
589
+ xor.b32 %r312, %r310, %r276;
590
+ xor.b32 %r313, %r311, %r277;
591
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
592
+ xor.b32 %r314, %r305, %r299;
593
+ xor.b32 %r315, %r307, %r301;
594
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
595
+ selp.b32 %r316, %r314, 0, %p111;
596
+ selp.b32 %r317, %r315, 0, %p112;
597
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
598
+ xor.b32 %r318, %r316, %r282;
599
+ xor.b32 %r319, %r317, %r283;
600
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
601
+ mul.lo.s32 %r320, %r312, %r130;
602
+ mul.lo.s32 %r321, %r313, %r130;
603
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
604
+ shfl.sync.bfly.b32 %r322, %r320, 8, 31, -1;
605
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
606
+ add.s32 %r323, %r320, %r322;
607
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
608
+ shfl.sync.bfly.b32 %r324, %r321, 8, 31, -1;
609
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
610
+ add.s32 %r325, %r321, %r324;
611
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
612
+ mul.lo.s32 %r326, %r312, %r127;
613
+ mul.lo.s32 %r327, %r313, %r127;
614
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
615
+ shfl.sync.bfly.b32 %r328, %r326, 8, 31, -1;
616
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
617
+ add.s32 %r329, %r326, %r328;
618
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
619
+ shfl.sync.bfly.b32 %r330, %r327, 8, 31, -1;
620
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
621
+ add.s32 %r331, %r327, %r330;
622
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
623
+ mul.lo.s32 %r332, %r318, %r130;
624
+ mul.lo.s32 %r333, %r319, %r130;
625
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
626
+ shfl.sync.bfly.b32 %r334, %r332, 8, 31, -1;
627
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
628
+ add.s32 %r335, %r332, %r334;
629
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
630
+ shfl.sync.bfly.b32 %r336, %r333, 8, 31, -1;
631
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
632
+ add.s32 %r337, %r333, %r336;
633
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
634
+ mul.lo.s32 %r338, %r318, %r127;
635
+ mul.lo.s32 %r339, %r319, %r127;
636
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
637
+ shfl.sync.bfly.b32 %r340, %r338, 8, 31, -1;
638
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
639
+ add.s32 %r341, %r340, %r338;
640
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
641
+ shfl.sync.bfly.b32 %r342, %r339, 8, 31, -1;
642
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
643
+ add.s32 %r343, %r342, %r339;
644
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
645
+ setp.lt.s32 %p113, %r323, %r329;
646
+ setp.ge.s32 %p114, %r325, %r331;
647
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
648
+ setp.eq.b32 %p115, %r323, %r329;
649
+ setp.ne.b32 %p116, %r325, %r331;
650
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
651
+ setp.gt.s32 %p117, %r335, %r341;
652
+ setp.le.s32 %p118, %r337, %r343;
653
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
654
+ and.pred %p119, %p115, %p117;
655
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
656
+ or.pred %p120, %p116, %p118;
657
+ or.pred %p121, %p113, %p119;
658
+ .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
659
+ and.pred %p122, %p114, %p120;
660
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
661
+ xor.b32 %r344, %r323, %r329;
662
+ xor.b32 %r345, %r325, %r331;
663
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
664
+ selp.b32 %r346, %r344, 0, %p121;
665
+ selp.b32 %r347, %r345, 0, %p122;
666
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
667
+ xor.b32 %r348, %r346, %r312;
668
+ xor.b32 %r349, %r347, %r313;
669
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
670
+ xor.b32 %r350, %r341, %r335;
671
+ xor.b32 %r351, %r343, %r337;
672
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
673
+ selp.b32 %r352, %r350, 0, %p121;
674
+ selp.b32 %r353, %r351, 0, %p122;
675
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
676
+ xor.b32 %r354, %r352, %r318;
677
+ xor.b32 %r355, %r353, %r319;
678
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
679
+ setp.lt.s32 %p123, %r348, %r349;
680
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
681
+ setp.eq.b32 %p124, %r348, %r349;
682
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
683
+ setp.gt.s32 %p125, %r354, %r355;
684
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
685
+ and.pred %p126, %p124, %p125;
686
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
687
+ or.pred %p127, %p123, %p126;
688
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
689
+ xor.b32 %r356, %r349, %r348;
690
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
691
+ selp.b32 %r357, %r356, 0, %p127;
692
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
693
+ xor.b32 %r358, %r357, %r348;
694
+ xor.b32 %r359, %r357, %r349;
695
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
696
+ xor.b32 %r360, %r355, %r354;
697
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
698
+ selp.b32 %r361, %r360, 0, %p127;
699
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
700
+ xor.b32 %r362, %r361, %r354;
701
+ xor.b32 %r363, %r361, %r355;
702
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
703
+ mul.lo.s32 %r52, %r358, %r132;
704
+ mul.lo.s32 %r54, %r359, %r132;
705
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
706
+ bar.sync 0;
707
+ // begin inline asm
708
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r52;
709
+ // end inline asm
710
+ // begin inline asm
711
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r54;
712
+ // end inline asm
713
+ bar.sync 0;
714
+ // begin inline asm
715
+ @%p5 ld.shared.b32 %r55, [ %r8 + 0 ];
716
+ // end inline asm
717
+ shfl.sync.bfly.b32 %r364, %r55, 1, 31, -1;
718
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
719
+ add.s32 %r58, %r364, %r55;
720
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
721
+ // begin inline asm
722
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r58;
723
+ // end inline asm
724
+ // begin inline asm
725
+ @%p5 ld.shared.b32 %r59, [ %r12 + 0 ];
726
+ // end inline asm
727
+ shfl.sync.bfly.b32 %r365, %r59, 1, 31, -1;
728
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
729
+ add.s32 %r62, %r365, %r59;
730
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
731
+ // begin inline asm
732
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r62;
733
+ // end inline asm
734
+ bar.sync 0;
735
+ ld.shared.b32 %r366, [%r252];
736
+ ld.shared.b32 %r367, [%r252+8];
737
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
738
+ mul.lo.s32 %r64, %r358, %r129;
739
+ mul.lo.s32 %r66, %r359, %r129;
740
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
741
+ bar.sync 0;
742
+ // begin inline asm
743
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r64;
744
+ // end inline asm
745
+ // begin inline asm
746
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r66;
747
+ // end inline asm
748
+ bar.sync 0;
749
+ // begin inline asm
750
+ @%p5 ld.shared.b32 %r67, [ %r8 + 0 ];
751
+ // end inline asm
752
+ shfl.sync.bfly.b32 %r368, %r67, 1, 31, -1;
753
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
754
+ add.s32 %r70, %r368, %r67;
755
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
756
+ // begin inline asm
757
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r70;
758
+ // end inline asm
759
+ // begin inline asm
760
+ @%p5 ld.shared.b32 %r71, [ %r12 + 0 ];
761
+ // end inline asm
762
+ shfl.sync.bfly.b32 %r369, %r71, 1, 31, -1;
763
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
764
+ add.s32 %r74, %r369, %r71;
765
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
766
+ // begin inline asm
767
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r74;
768
+ // end inline asm
769
+ bar.sync 0;
770
+ ld.shared.b32 %r370, [%r252];
771
+ ld.shared.b32 %r371, [%r252+8];
772
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
773
+ mul.lo.s32 %r76, %r362, %r132;
774
+ mul.lo.s32 %r78, %r363, %r132;
775
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
776
+ bar.sync 0;
777
+ // begin inline asm
778
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r76;
779
+ // end inline asm
780
+ // begin inline asm
781
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r78;
782
+ // end inline asm
783
+ bar.sync 0;
784
+ // begin inline asm
785
+ @%p5 ld.shared.b32 %r79, [ %r8 + 0 ];
786
+ // end inline asm
787
+ shfl.sync.bfly.b32 %r372, %r79, 1, 31, -1;
788
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
789
+ add.s32 %r82, %r372, %r79;
790
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
791
+ // begin inline asm
792
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r82;
793
+ // end inline asm
794
+ // begin inline asm
795
+ @%p5 ld.shared.b32 %r83, [ %r12 + 0 ];
796
+ // end inline asm
797
+ shfl.sync.bfly.b32 %r373, %r83, 1, 31, -1;
798
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
799
+ add.s32 %r86, %r373, %r83;
800
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
801
+ // begin inline asm
802
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r86;
803
+ // end inline asm
804
+ bar.sync 0;
805
+ ld.shared.b32 %r374, [%r252];
806
+ ld.shared.b32 %r375, [%r252+8];
807
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
808
+ mul.lo.s32 %r88, %r362, %r129;
809
+ mul.lo.s32 %r90, %r363, %r129;
810
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
811
+ bar.sync 0;
812
+ // begin inline asm
813
+ @%p3 st.shared.b32 [ %r3 + 0 ], %r88;
814
+ // end inline asm
815
+ // begin inline asm
816
+ @%p3 st.shared.b32 [ %r5 + 0 ], %r90;
817
+ // end inline asm
818
+ bar.sync 0;
819
+ // begin inline asm
820
+ @%p5 ld.shared.b32 %r91, [ %r8 + 0 ];
821
+ // end inline asm
822
+ shfl.sync.bfly.b32 %r376, %r91, 1, 31, -1;
823
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
824
+ add.s32 %r94, %r376, %r91;
825
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
826
+ // begin inline asm
827
+ @%p6 st.shared.b32 [ %r8 + 0 ], %r94;
828
+ // end inline asm
829
+ // begin inline asm
830
+ @%p5 ld.shared.b32 %r95, [ %r12 + 0 ];
831
+ // end inline asm
832
+ shfl.sync.bfly.b32 %r377, %r95, 1, 31, -1;
833
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
834
+ add.s32 %r98, %r377, %r95;
835
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
836
+ // begin inline asm
837
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r98;
838
+ // end inline asm
839
+ bar.sync 0;
840
+ ld.shared.b32 %r378, [%r252];
841
+ ld.shared.b32 %r379, [%r252+8];
842
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
843
+ setp.lt.s32 %p128, %r366, %r370;
844
+ setp.lt.s32 %p129, %r367, %r371;
845
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
846
+ setp.eq.b32 %p130, %r366, %r370;
847
+ setp.eq.b32 %p131, %r367, %r371;
848
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
849
+ setp.gt.s32 %p132, %r374, %r378;
850
+ setp.gt.s32 %p133, %r375, %r379;
851
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
852
+ and.pred %p134, %p130, %p132;
853
+ and.pred %p135, %p131, %p133;
854
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
855
+ or.pred %p136, %p128, %p134;
856
+ or.pred %p137, %p129, %p135;
857
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
858
+ xor.b32 %r380, %r370, %r366;
859
+ xor.b32 %r381, %r371, %r367;
860
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
861
+ selp.b32 %r382, %r380, 0, %p136;
862
+ selp.b32 %r383, %r381, 0, %p137;
863
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
864
+ xor.b32 %r384, %r382, %r358;
865
+ xor.b32 %r385, %r383, %r359;
866
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
867
+ xor.b32 %r386, %r378, %r374;
868
+ xor.b32 %r387, %r379, %r375;
869
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
870
+ selp.b32 %r388, %r386, 0, %p136;
871
+ selp.b32 %r389, %r387, 0, %p137;
872
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
873
+ xor.b32 %r390, %r388, %r362;
874
+ xor.b32 %r391, %r389, %r363;
875
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
876
+ mul.lo.s32 %r392, %r384, %r131;
877
+ mul.lo.s32 %r393, %r385, %r131;
878
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
879
+ shfl.sync.bfly.b32 %r394, %r392, 16, 31, -1;
880
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
881
+ add.s32 %r395, %r392, %r394;
882
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
883
+ shfl.sync.bfly.b32 %r396, %r393, 16, 31, -1;
884
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
885
+ add.s32 %r397, %r393, %r396;
886
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
887
+ mul.lo.s32 %r398, %r384, %r124;
888
+ mul.lo.s32 %r399, %r385, %r124;
889
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
890
+ shfl.sync.bfly.b32 %r400, %r398, 16, 31, -1;
891
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
892
+ add.s32 %r401, %r398, %r400;
893
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
894
+ shfl.sync.bfly.b32 %r402, %r399, 16, 31, -1;
895
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
896
+ add.s32 %r403, %r399, %r402;
897
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
898
+ mul.lo.s32 %r404, %r390, %r131;
899
+ mul.lo.s32 %r405, %r391, %r131;
900
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
901
+ shfl.sync.bfly.b32 %r406, %r404, 16, 31, -1;
902
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
903
+ add.s32 %r407, %r404, %r406;
904
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
905
+ shfl.sync.bfly.b32 %r408, %r405, 16, 31, -1;
906
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
907
+ add.s32 %r409, %r405, %r408;
908
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
909
+ mul.lo.s32 %r410, %r390, %r124;
910
+ mul.lo.s32 %r411, %r391, %r124;
911
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
912
+ shfl.sync.bfly.b32 %r412, %r410, 16, 31, -1;
913
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
914
+ add.s32 %r413, %r412, %r410;
915
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
916
+ shfl.sync.bfly.b32 %r414, %r411, 16, 31, -1;
917
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
918
+ add.s32 %r415, %r414, %r411;
919
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
920
+ setp.lt.s32 %p138, %r395, %r401;
921
+ setp.lt.s32 %p139, %r397, %r403;
922
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
923
+ setp.eq.b32 %p140, %r395, %r401;
924
+ setp.eq.b32 %p141, %r397, %r403;
925
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
926
+ setp.gt.s32 %p142, %r407, %r413;
927
+ setp.gt.s32 %p143, %r409, %r415;
928
+ .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
929
+ and.pred %p144, %p140, %p142;
930
+ and.pred %p145, %p141, %p143;
931
+ .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
932
+ or.pred %p146, %p138, %p144;
933
+ or.pred %p147, %p139, %p145;
934
+ .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
935
+ xor.b32 %r416, %r395, %r401;
936
+ xor.b32 %r417, %r397, %r403;
937
+ .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
938
+ selp.b32 %r418, %r416, 0, %p146;
939
+ selp.b32 %r419, %r417, 0, %p147;
940
+ .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
941
+ xor.b32 %r420, %r418, %r384;
942
+ xor.b32 %r421, %r419, %r385;
943
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
944
+ xor.b32 %r422, %r413, %r407;
945
+ xor.b32 %r423, %r415, %r409;
946
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
947
+ selp.b32 %r424, %r422, 0, %p146;
948
+ selp.b32 %r425, %r423, 0, %p147;
949
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
950
+ xor.b32 %r426, %r424, %r390;
951
+ xor.b32 %r427, %r425, %r391;
952
+ .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
953
+ mul.lo.s32 %r428, %r420, %r130;
954
+ mul.lo.s32 %r429, %r421, %r130;
955
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
956
+ shfl.sync.bfly.b32 %r430, %r428, 8, 31, -1;
957
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
958
+ add.s32 %r431, %r428, %r430;
959
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
960
+ shfl.sync.bfly.b32 %r432, %r429, 8, 31, -1;
961
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
962
+ add.s32 %r433, %r429, %r432;
963
+ .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
964
+ mul.lo.s32 %r434, %r420, %r127;
965
+ mul.lo.s32 %r435, %r421, %r127;
966
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
967
+ shfl.sync.bfly.b32 %r436, %r434, 8, 31, -1;
968
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
969
+ add.s32 %r437, %r434, %r436;
970
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
971
+ shfl.sync.bfly.b32 %r438, %r435, 8, 31, -1;
972
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
973
+ add.s32 %r439, %r435, %r438;
974
+ .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
975
+ mul.lo.s32 %r440, %r426, %r130;
976
+ mul.lo.s32 %r441, %r427, %r130;
977
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
978
+ shfl.sync.bfly.b32 %r442, %r440, 8, 31, -1;
979
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
980
+ add.s32 %r443, %r440, %r442;
981
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
982
+ shfl.sync.bfly.b32 %r444, %r441, 8, 31, -1;
983
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
984
+ add.s32 %r445, %r441, %r444;
985
+ .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
986
+ mul.lo.s32 %r446, %r426, %r127;
987
+ mul.lo.s32 %r447, %r427, %r127;
988
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
989
+ shfl.sync.bfly.b32 %r448, %r446, 8, 31, -1;
990
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
991
+ add.s32 %r449, %r448, %r446;
992
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
993
+ shfl.sync.bfly.b32 %r450, %r447, 8, 31, -1;
994
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
995
+ add.s32 %r451, %r450, %r447;
996
+ .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
997
+ setp.lt.s32 %p148, %r431, %r437;
998
+ setp.lt.s32 %p149, %r433, %r439;
999
+ .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
1000
+ setp.eq.b32 %p150, %r431, %r437;
1001
+ setp.eq.b32 %p151, %r433, %r439;
1002
+ .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
1003
+ setp.gt.s32 %p152, %r443, %r449;
1004
+ setp.gt.s32 %p153, %r445, %r451;
1005
+ .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
1006
+ xor.b32 %r452, %r449, %r443;
1007
+ xor.b32 %r453, %r451, %r445;
1008
+ .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
1009
+ selp.b32 %r454, %r452, 0, %p152;
1010
+ selp.b32 %r455, %r454, 0, %p150;
1011
+ selp.b32 %r456, %r452, %r455, %p148;
1012
+ selp.b32 %r457, %r453, 0, %p153;
1013
+ selp.b32 %r458, %r457, 0, %p151;
1014
+ selp.b32 %r459, %r453, %r458, %p149;
1015
+ .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
1016
+ xor.b32 %r460, %r456, %r426;
1017
+ xor.b32 %r461, %r459, %r427;
1018
+ $L__tmp2:
1019
+ .loc 1 44 34 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:44:34
1020
+ selp.b32 %r462, %r1, 0, %p1;
1021
+ cvt.s64.s32 %rd11, %r462;
1022
+ selp.b32 %r463, %r2, 0, %p1;
1023
+ cvt.s64.s32 %rd12, %r463;
1024
+ $L__tmp3:
1025
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1026
+ bar.sync 0;
1027
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1028
+ add.s64 %rd13, %rd12, %rd11;
1029
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1030
+ mov.b64 {_, %r464}, %rd13;
1031
+ cvt.u32.u64 %r465, %rd13;
1032
+ shfl.sync.bfly.b32 %r466, %r465, 16, 31, -1;
1033
+ shfl.sync.bfly.b32 %r467, %r464, 16, 31, -1;
1034
+ cvt.u64.u32 %rd14, %r466;
1035
+ cvt.u64.u32 %rd15, %r467;
1036
+ shl.b64 %rd16, %rd15, 32;
1037
+ or.b64 %rd17, %rd14, %rd16;
1038
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1039
+ add.s64 %rd18, %rd13, %rd17;
1040
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1041
+ mov.b64 {_, %r468}, %rd18;
1042
+ cvt.u32.u64 %r469, %rd18;
1043
+ shfl.sync.bfly.b32 %r470, %r469, 8, 31, -1;
1044
+ shfl.sync.bfly.b32 %r471, %r468, 8, 31, -1;
1045
+ cvt.u64.u32 %rd19, %r470;
1046
+ cvt.u64.u32 %rd20, %r471;
1047
+ shl.b64 %rd21, %rd20, 32;
1048
+ or.b64 %rd22, %rd19, %rd21;
1049
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1050
+ add.s64 %rd3, %rd18, %rd22;
1051
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1052
+ setp.eq.b32 %p51, %r108, 0;
1053
+ shl.b32 %r472, %r109, 4;
1054
+ add.s32 %r473, %r251, %r472;
1055
+ shl.b32 %r474, %r129, 3;
1056
+ add.s32 %r99, %r473, %r474;
1057
+ // begin inline asm
1058
+ @%p51 st.shared.b64 [ %r99 + 0 ], %rd3;
1059
+ // end inline asm
1060
+ bar.sync 0;
1061
+ setp.lt.u32 %p52, %r107, 16;
1062
+ shl.b32 %r475, %r107, 3;
1063
+ add.s32 %r100, %r251, %r475;
1064
+ // begin inline asm
1065
+ @%p52 ld.shared.b64 %rd4, [ %r100 + 0 ];
1066
+ // end inline asm
1067
+ mov.b64 {_, %r476}, %rd4;
1068
+ cvt.u32.u64 %r477, %rd4;
1069
+ shfl.sync.bfly.b32 %r478, %r477, 1, 31, -1;
1070
+ shfl.sync.bfly.b32 %r479, %r476, 1, 31, -1;
1071
+ cvt.u64.u32 %rd23, %r478;
1072
+ cvt.u64.u32 %rd24, %r479;
1073
+ shl.b64 %rd25, %rd24, 32;
1074
+ or.b64 %rd26, %rd23, %rd25;
1075
+ .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1076
+ add.s64 %rd5, %rd4, %rd26;
1077
+ .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
1078
+ and.b32 %r480, %r107, 1009;
1079
+ setp.eq.b32 %p53, %r480, 0;
1080
+ // begin inline asm
1081
+ @%p53 st.shared.b64 [ %r100 + 0 ], %rd5;
1082
+ // end inline asm
1083
+ bar.sync 0;
1084
+ ld.shared.b32 %r104, [%r473];
1085
+ $L__tmp4:
1086
+ .loc 1 49 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:35
1087
+ shl.b32 %r481, %r112, 4;
1088
+ .loc 1 49 32 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:32
1089
+ or.b32 %r482, %r481, %r114;
1090
+ .loc 1 49 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:25
1091
+ mad.wide.s32 %rd6, %r482, 4, %rd9;
1092
+ .loc 1 49 47 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:47
1093
+ bar.sync 0;
1094
+ and.b32 %r483, %r107, 3;
1095
+ shl.b32 %r484, %r483, 3;
1096
+ and.b32 %r485, %r247, 96;
1097
+ bfe.s32 %r486, %r107, 2, 1;
1098
+ and.b32 %r487, %r486, 192;
1099
+ and.b32 %r488, %r125, 260;
1100
+ or.b32 %r489, %r484, %r485;
1101
+ xor.b32 %r490, %r489, %r487;
1102
+ or.b32 %r491, %r490, %r488;
1103
+ add.s32 %r492, %r251, %r491;
1104
+ st.shared.b32 [%r492], %r460;
1105
+ xor.b32 %r493, %r491, 4;
1106
+ add.s32 %r494, %r251, %r493;
1107
+ st.shared.b32 [%r494], %r461;
1108
+ bar.sync 0;
1109
+ shl.b32 %r495, %r483, 5;
1110
+ and.b32 %r496, %r107, 28;
1111
+ and.b32 %r497, %r128, 192;
1112
+ or.b32 %r498, %r495, %r496;
1113
+ xor.b32 %r499, %r498, %r497;
1114
+ add.s32 %r500, %r251, %r499;
1115
+ ld.shared.b32 %r102, [%r500];
1116
+ xor.b32 %r501, %r499, 4;
1117
+ add.s32 %r502, %r251, %r501;
1118
+ ld.shared.b32 %r103, [%r502+256];
1119
+ // begin inline asm
1120
+ @%p54 st.global.v2.b32 [ %rd6 + 0 ], { %r102, %r103 };
1121
+ // end inline asm
1122
+ .loc 1 50 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:25
1123
+ mad.wide.s32 %rd7, %r111, 4, %rd10;
1124
+ .loc 1 50 37 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:37
1125
+ and.b32 %r503, %r107, 56;
1126
+ setp.eq.b32 %p154, %r503, 0;
1127
+ and.pred %p55, %p154, %p1;
1128
+ // begin inline asm
1129
+ @%p55 st.global.b32 [ %rd7 + 0 ], { %r104 };
1130
+ // end inline asm
1131
+ .loc 1 50 4 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:4
1132
+ ret;
1133
+ $L__tmp5:
1134
+ $L__func_end0:
1135
+ // -- End function
1136
+ }
1137
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py"
1138
+ .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
1139
+ .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
1140
+ .section .debug_abbrev
1141
+ {
1142
+ .b8 1 // Abbreviation Code
1143
+ .b8 17 // DW_TAG_compile_unit
1144
+ .b8 1 // DW_CHILDREN_yes
1145
+ .b8 37 // DW_AT_producer
1146
+ .b8 8 // DW_FORM_string
1147
+ .b8 19 // DW_AT_language
1148
+ .b8 5 // DW_FORM_data2
1149
+ .b8 3 // DW_AT_name
1150
+ .b8 8 // DW_FORM_string
1151
+ .b8 16 // DW_AT_stmt_list
1152
+ .b8 6 // DW_FORM_data4
1153
+ .b8 27 // DW_AT_comp_dir
1154
+ .b8 8 // DW_FORM_string
1155
+ .b8 0 // EOM(1)
1156
+ .b8 0 // EOM(2)
1157
+ .b8 2 // Abbreviation Code
1158
+ .b8 46 // DW_TAG_subprogram
1159
+ .b8 0 // DW_CHILDREN_no
1160
+ .b8 3 // DW_AT_name
1161
+ .b8 8 // DW_FORM_string
1162
+ .b8 32 // DW_AT_inline
1163
+ .b8 11 // DW_FORM_data1
1164
+ .b8 0 // EOM(1)
1165
+ .b8 0 // EOM(2)
1166
+ .b8 3 // Abbreviation Code
1167
+ .b8 46 // DW_TAG_subprogram
1168
+ .b8 1 // DW_CHILDREN_yes
1169
+ .b8 17 // DW_AT_low_pc
1170
+ .b8 1 // DW_FORM_addr
1171
+ .b8 18 // DW_AT_high_pc
1172
+ .b8 1 // DW_FORM_addr
1173
+ .b8 49 // DW_AT_abstract_origin
1174
+ .b8 19 // DW_FORM_ref4
1175
+ .b8 0 // EOM(1)
1176
+ .b8 0 // EOM(2)
1177
+ .b8 4 // Abbreviation Code
1178
+ .b8 29 // DW_TAG_inlined_subroutine
1179
+ .b8 0 // DW_CHILDREN_no
1180
+ .b8 49 // DW_AT_abstract_origin
1181
+ .b8 19 // DW_FORM_ref4
1182
+ .b8 17 // DW_AT_low_pc
1183
+ .b8 1 // DW_FORM_addr
1184
+ .b8 18 // DW_AT_high_pc
1185
+ .b8 1 // DW_FORM_addr
1186
+ .b8 88 // DW_AT_call_file
1187
+ .b8 11 // DW_FORM_data1
1188
+ .b8 89 // DW_AT_call_line
1189
+ .b8 11 // DW_FORM_data1
1190
+ .b8 87 // DW_AT_call_column
1191
+ .b8 11 // DW_FORM_data1
1192
+ .b8 0 // EOM(1)
1193
+ .b8 0 // EOM(2)
1194
+ .b8 0 // EOM(3)
1195
+ }
1196
+ .section .debug_info
1197
+ {
1198
+ .b32 267 // Length of Unit
1199
+ .b8 2 // DWARF version number
1200
+ .b8 0
1201
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
1202
+ .b8 8 // Address Size (in bytes)
1203
+ .b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit
1204
+ .b8 116 // DW_AT_producer
1205
+ .b8 114
1206
+ .b8 105
1207
+ .b8 116
1208
+ .b8 111
1209
+ .b8 110
1210
+ .b8 0
1211
+ .b8 2 // DW_AT_language
1212
+ .b8 0
1213
+ .b8 99 // DW_AT_name
1214
+ .b8 104
1215
+ .b8 120
1216
+ .b8 110
1217
+ .b8 121
1218
+ .b8 103
1219
+ .b8 112
1220
+ .b8 118
1221
+ .b8 112
1222
+ .b8 109
1223
+ .b8 118
1224
+ .b8 114
1225
+ .b8 50
1226
+ .b8 109
1227
+ .b8 120
1228
+ .b8 50
1229
+ .b8 101
1230
+ .b8 54
1231
+ .b8 109
1232
+ .b8 119
1233
+ .b8 103
1234
+ .b8 100
1235
+ .b8 101
1236
+ .b8 111
1237
+ .b8 106
1238
+ .b8 116
1239
+ .b8 104
1240
+ .b8 114
1241
+ .b8 105
1242
+ .b8 114
1243
+ .b8 110
1244
+ .b8 111
1245
+ .b8 103
1246
+ .b8 55
1247
+ .b8 110
1248
+ .b8 109
1249
+ .b8 113
1250
+ .b8 54
1251
+ .b8 109
1252
+ .b8 99
1253
+ .b8 115
1254
+ .b8 105
1255
+ .b8 51
1256
+ .b8 119
1257
+ .b8 118
1258
+ .b8 101
1259
+ .b8 103
1260
+ .b8 118
1261
+ .b8 105
1262
+ .b8 50
1263
+ .b8 115
1264
+ .b8 111
1265
+ .b8 46
1266
+ .b8 112
1267
+ .b8 121
1268
+ .b8 0
1269
+ .b32 .debug_line // DW_AT_stmt_list
1270
+ .b8 47 // DW_AT_comp_dir
1271
+ .b8 119
1272
+ .b8 111
1273
+ .b8 114
1274
+ .b8 107
1275
+ .b8 115
1276
+ .b8 112
1277
+ .b8 97
1278
+ .b8 99
1279
+ .b8 101
1280
+ .b8 47
1281
+ .b8 104
1282
+ .b8 97
1283
+ .b8 110
1284
+ .b8 114
1285
+ .b8 117
1286
+ .b8 105
1287
+ .b8 47
1288
+ .b8 83
1289
+ .b8 112
1290
+ .b8 101
1291
+ .b8 99
1292
+ .b8 70
1293
+ .b8 111
1294
+ .b8 114
1295
+ .b8 103
1296
+ .b8 101
1297
+ .b8 45
1298
+ .b8 101
1299
+ .b8 120
1300
+ .b8 116
1301
+ .b8 47
1302
+ .b8 99
1303
+ .b8 97
1304
+ .b8 99
1305
+ .b8 104
1306
+ .b8 101
1307
+ .b8 47
1308
+ .b8 99
1309
+ .b8 111
1310
+ .b8 109
1311
+ .b8 112
1312
+ .b8 105
1313
+ .b8 108
1314
+ .b8 101
1315
+ .b8 100
1316
+ .b8 95
1317
+ .b8 107
1318
+ .b8 101
1319
+ .b8 114
1320
+ .b8 110
1321
+ .b8 101
1322
+ .b8 108
1323
+ .b8 115
1324
+ .b8 47
1325
+ .b8 104
1326
+ .b8 120
1327
+ .b8 0
1328
+ .b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram
1329
+ .b8 116 // DW_AT_name
1330
+ .b8 114
1331
+ .b8 105
1332
+ .b8 116
1333
+ .b8 111
1334
+ .b8 110
1335
+ .b8 95
1336
+ .b8 112
1337
+ .b8 101
1338
+ .b8 114
1339
+ .b8 95
1340
+ .b8 102
1341
+ .b8 117
1342
+ .b8 115
1343
+ .b8 101
1344
+ .b8 100
1345
+ .b8 95
1346
+ .b8 95
1347
+ .b8 116
1348
+ .b8 111
1349
+ .b8 95
1350
+ .b8 99
1351
+ .b8 111
1352
+ .b8 112
1353
+ .b8 121
1354
+ .b8 95
1355
+ .b8 99
1356
+ .b8 108
1357
+ .b8 111
1358
+ .b8 110
1359
+ .b8 101
1360
+ .b8 95
1361
+ .b8 115
1362
+ .b8 108
1363
+ .b8 105
1364
+ .b8 99
1365
+ .b8 101
1366
+ .b8 95
1367
+ .b8 115
1368
+ .b8 111
1369
+ .b8 114
1370
+ .b8 116
1371
+ .b8 95
1372
+ .b8 115
1373
+ .b8 117
1374
+ .b8 109
1375
+ .b8 95
1376
+ .b8 116
1377
+ .b8 114
1378
+ .b8 97
1379
+ .b8 110
1380
+ .b8 115
1381
+ .b8 112
1382
+ .b8 111
1383
+ .b8 115
1384
+ .b8 101
1385
+ .b8 95
1386
+ .b8 51
1387
+ .b8 0
1388
+ .b8 1 // DW_AT_inline
1389
+ .b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram
1390
+ .b64 $L__func_begin0 // DW_AT_low_pc
1391
+ .b64 $L__func_end0 // DW_AT_high_pc
1392
+ .b32 139 // DW_AT_abstract_origin
1393
+ .b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine
1394
+ .b32 139 // DW_AT_abstract_origin
1395
+ .b64 $L__tmp1 // DW_AT_low_pc
1396
+ .b64 $L__tmp2 // DW_AT_high_pc
1397
+ .b8 1 // DW_AT_call_file
1398
+ .b8 41 // DW_AT_call_line
1399
+ .b8 67 // DW_AT_call_column
1400
+ .b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine
1401
+ .b32 139 // DW_AT_abstract_origin
1402
+ .b64 $L__tmp3 // DW_AT_low_pc
1403
+ .b64 $L__tmp4 // DW_AT_high_pc
1404
+ .b8 1 // DW_AT_call_file
1405
+ .b8 45 // DW_AT_call_line
1406
+ .b8 26 // DW_AT_call_column
1407
+ .b8 0 // End Of Children Mark
1408
+ .b8 0 // End Of Children Mark
1409
+ }
1410
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir ADDED
@@ -0,0 +1,841 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [2, 1], order = [1, 0]}>
3
+ #linear = #ttg.linear<{register = [[0, 8]], lane = [[1, 0], [2, 0], [4, 0], [0, 1], [0, 2]], warp = [[0, 4]], block = []}>
4
+ #linear1 = #ttg.linear<{register = [[4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [0, 1, 0], [1, 0, 0]], warp = [[2, 0, 0]], block = []}>
5
+ #linear2 = #ttg.linear<{register = [[2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [0, 0, 1], [0, 1, 0]], warp = [[1, 0, 0]], block = []}>
6
+ #linear3 = #ttg.linear<{register = [[1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 1, 0]], block = []}>
7
+ #linear4 = #ttg.linear<{register = [[0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 0, 4]], block = []}>
8
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0)
9
+ #loc1 = loc(unknown)
10
+ #loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
11
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67)
12
+ #loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
13
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
14
+ #loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
15
+ #loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
16
+ #loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
17
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26)
18
+ #loc77 = loc("in_ptr0"(#loc))
19
+ #loc78 = loc("out_ptr2"(#loc))
20
+ #loc79 = loc("out_ptr3"(#loc))
21
+ #loc80 = loc("xnumel"(#loc))
22
+ #loc81 = loc("r0_numel"(#loc))
23
+ #loc99 = loc(callsite(#loc19 at #loc20))
24
+ #loc105 = loc("ileft"(#loc28))
25
+ #loc109 = loc("iright"(#loc33))
26
+ #loc118 = loc("left_idx"(#loc42))
27
+ #loc123 = loc("right_idx"(#loc47))
28
+ #loc143 = loc("tmp11"(#loc67))
29
+ #loc149 = loc(callsite(#loc24 at #loc99))
30
+ #loc153 = loc(callsite(#loc1 at #loc143))
31
+ #loc157 = loc(callsite(#loc105 at #loc149))
32
+ #loc161 = loc(callsite(#loc109 at #loc149))
33
+ #loc169 = loc(callsite(#loc118 at #loc149))
34
+ #loc174 = loc(callsite(#loc123 at #loc149))
35
+ #loc194 = loc(callsite(#loc1 at #loc157))
36
+ #loc196 = loc(callsite(#loc1 at #loc161))
37
+ #loc199 = loc(callsite(#loc1 at #loc169))
38
+ #loc202 = loc(callsite(#loc1 at #loc174))
39
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
40
+ tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
41
+ %cst = arith.constant dense<0> : tensor<8x16xi32, #linear> loc(#loc1)
42
+ %cst_0 = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1)
43
+ %c8_i32 = arith.constant 8 : i32 loc(#loc1)
44
+ %cst_1 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1)
45
+ %cst_2 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1)
46
+ %cst_3 = arith.constant dense<16> : tensor<8x1xi32, #blocked> loc(#loc1)
47
+ %cst_4 = arith.constant dense<16> : tensor<8x1xi32, #blocked1> loc(#loc1)
48
+ %cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1)
49
+ %cst_6 = arith.constant dense<272> : tensor<8x1xi32, #blocked> loc(#loc1)
50
+ %cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1)
51
+ %cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1)
52
+ %cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1)
53
+ %cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1)
54
+ %cst_11 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1)
55
+ %xoffset = tt.get_program_id x : i32 loc(#loc82)
56
+ %xoffset_12 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc83)
57
+ %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84)
58
+ %xindex_13 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84)
59
+ %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc84)
60
+ %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc84)
61
+ %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<8x1xi32, #blocked> loc(#loc85)
62
+ %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc85)
63
+ %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<8x1xi32, #blocked> loc(#loc85)
64
+ %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<8x1xi32, #blocked1> loc(#loc85)
65
+ %xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc86)
66
+ %xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<8x1xi32, #blocked1> loc(#loc86)
67
+ %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87)
68
+ %r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87)
69
+ %r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87)
70
+ %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87)
71
+ %r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87)
72
+ %r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87)
73
+ %x0 = arith.remsi %xindex_18, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc88)
74
+ %x1 = arith.divsi %xindex_18, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc89)
75
+ %tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90)
76
+ %tmp0_26 = tt.broadcast %x0 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc91)
77
+ %tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc91)
78
+ %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x16xi32, #blocked> loc(#loc91)
79
+ %tmp0_29 = arith.muli %x1, %cst_6 : tensor<8x1xi32, #blocked> loc(#loc92)
80
+ %tmp0_30 = tt.broadcast %tmp0_29 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc93)
81
+ %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<8x16xi32, #blocked> loc(#loc93)
82
+ %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>, #blocked> loc(#loc94)
83
+ %tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<8x16x!tt.ptr<i32>, #blocked>, tensor<8x16xi32, #blocked> loc(#loc94)
84
+ %tmp0_34 = tt.broadcast %xmask : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc95)
85
+ %tmp0_35 = tt.broadcast %xmask_20 : tensor<8x1xi1, #blocked1> -> tensor<8x16xi1, #blocked1> loc(#loc95)
86
+ %tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<8x16x!tt.ptr<i32>, #blocked> loc(#loc95)
87
+ %tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96)
88
+ %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<8x16xi16, #linear> loc(#loc97)
89
+ %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146)
90
+ %flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146)
91
+ %flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146)
92
+ %flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146)
93
+ %flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146)
94
+ %flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146)
95
+ %flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146)
96
+ %flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146)
97
+ %flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146)
98
+ %flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146)
99
+ %flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146)
100
+ %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146)
101
+ %flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc147)
102
+ %flip_49 = tt.reshape %flip_48 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #blocked> loc(#loc148)
103
+ %flip_50 = tt.reshape %flip_48 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc148)
104
+ %y = tt.reshape %tmp0_36 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
105
+ %left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155)
106
+ %left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155)
107
+ %left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155)
108
+ %left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155)
109
+ %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc156)
110
+ %ileft_54 = arith.muli %y, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
111
+ %ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({
112
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
113
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
114
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
115
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
116
+ %ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
117
+ %ileft_57 = tt.broadcast %ileft_56 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
118
+ %iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc160)
119
+ %iright_58 = arith.muli %y, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
120
+ %iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({
121
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
122
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
123
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
124
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
125
+ %iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
126
+ %iright_61 = tt.broadcast %iright_60 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
127
+ %ileft_62 = tt.reshape %ileft_57 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc164)
128
+ %ileft_63 = tt.reshape %ileft_57 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
129
+ %iright_64 = tt.reshape %iright_61 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc165)
130
+ %iright_65 = tt.reshape %iright_61 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
131
+ %y_idx = tt.reshape %tmp4 : tensor<8x16xi16, #linear> -> tensor<64x2x1xi16, #linear1> loc(#loc166)
132
+ %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167)
133
+ %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc168)
134
+ %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<64x2x1xi16, #linear1> loc(#loc168)
135
+ %input = arith.extsi %left_idx_67 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc197)
136
+ %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
137
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
138
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
139
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
140
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
141
+ %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
142
+ %left_idx_70 = tt.broadcast %left_idx_69 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
143
+ %right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172)
144
+ %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc173)
145
+ %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<64x2x1xi16, #linear1> loc(#loc173)
146
+ %input_73 = arith.extsi %right_idx_72 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc200)
147
+ %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({
148
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
149
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
150
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
151
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
152
+ %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
153
+ %right_idx_76 = tt.broadcast %right_idx_75 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
154
+ %left_idx_77 = tt.reshape %left_idx_70 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc177)
155
+ %left_idx_78 = tt.reshape %left_idx_70 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
156
+ %right_idx_79 = tt.reshape %right_idx_76 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc178)
157
+ %right_idx_80 = tt.reshape %right_idx_76 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
158
+ %cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc179)
159
+ %cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<8x16xi32, #linear> loc(#loc179)
160
+ %eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc180)
161
+ %eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<8x16xi32, #linear> loc(#loc180)
162
+ %cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<8x16xi32, #blocked> loc(#loc181)
163
+ %cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<8x16xi32, #linear> loc(#loc181)
164
+ %cond_85 = arith.andi %eq, %cond_83 : tensor<8x16xi1, #blocked> loc(#loc182)
165
+ %cond_86 = arith.andi %eq_82, %cond_84 : tensor<8x16xi1, #linear> loc(#loc182)
166
+ %cond_87 = arith.ori %cond, %cond_85 : tensor<8x16xi1, #blocked> loc(#loc183)
167
+ %cond_88 = arith.ori %cond_81, %cond_86 : tensor<8x16xi1, #linear> loc(#loc183)
168
+ %cond_89 = arith.extui %cond_87 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc184)
169
+ %cond_90 = arith.extui %cond_88 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
170
+ %cond_91 = arith.xori %cond_89, %flip_49 : tensor<8x16xi32, #blocked> loc(#loc184)
171
+ %cond_92 = arith.xori %cond_90, %flip_50 : tensor<8x16xi32, #linear> loc(#loc184)
172
+ %cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<8x16xi32, #blocked> loc(#loc185)
173
+ %cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<8x16xi32, #linear> loc(#loc185)
174
+ %ret = arith.xori %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc186)
175
+ %ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc187)
176
+ %ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<8x16xi32, #blocked> loc(#loc188)
177
+ %ret_97 = ttg.convert_layout %ret_96 : tensor<8x16xi32, #blocked> -> tensor<8x16xi32, #linear> loc(#loc188)
178
+ %new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<8x16xi32, #linear> loc(#loc189)
179
+ %new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
180
+ %new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191)
181
+ %new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<8x16xi32, #linear> loc(#loc191)
182
+ %new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<8x16xi32, #linear> loc(#loc191)
183
+ %flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc147)
184
+ %flip_103 = tt.reshape %flip_102 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc148)
185
+ %y_104 = tt.reshape %ret_96 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #linear2> loc(#loc154)
186
+ %ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc156)
187
+ %ileft_106 = arith.muli %y_104, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156)
188
+ %ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({
189
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
190
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
191
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
192
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
193
+ %ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158)
194
+ %ileft_109 = tt.broadcast %ileft_108 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159)
195
+ %iright_110 = arith.muli %y_104, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160)
196
+ %iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({
197
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
198
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
199
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
200
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
201
+ %iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162)
202
+ %iright_113 = tt.broadcast %iright_112 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163)
203
+ %ileft_114 = tt.reshape %ileft_109 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164)
204
+ %iright_115 = tt.reshape %iright_113 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165)
205
+ %y_idx_116 = tt.reshape %new_idxs_101 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166)
206
+ %left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168)
207
+ %left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({
208
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
209
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
210
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
211
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
212
+ %left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170)
213
+ %left_idx_120 = tt.broadcast %left_idx_119 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171)
214
+ %right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173)
215
+ %right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({
216
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
217
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
218
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
219
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
220
+ %right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175)
221
+ %right_idx_124 = tt.broadcast %right_idx_123 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176)
222
+ %left_idx_125 = tt.reshape %left_idx_120 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177)
223
+ %right_idx_126 = tt.reshape %right_idx_124 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178)
224
+ %cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc179)
225
+ %eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc180)
226
+ %cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<8x16xi32, #linear> loc(#loc181)
227
+ %cond_130 = arith.andi %eq_128, %cond_129 : tensor<8x16xi1, #linear> loc(#loc182)
228
+ %cond_131 = arith.ori %cond_127, %cond_130 : tensor<8x16xi1, #linear> loc(#loc183)
229
+ %cond_132 = arith.extui %cond_131 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
230
+ %cond_133 = arith.xori %cond_132, %flip_103 : tensor<8x16xi32, #linear> loc(#loc184)
231
+ %cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<8x16xi32, #linear> loc(#loc185)
232
+ %ret_135 = arith.xori %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc186)
233
+ %ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
234
+ %ret_137 = arith.xori %ret_97, %ret_136 : tensor<8x16xi32, #linear> loc(#loc188)
235
+ %new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<8x16xi32, #linear> loc(#loc189)
236
+ %new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
237
+ %new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<8x16xi32, #linear> loc(#loc191)
238
+ %y_141 = tt.reshape %ret_137 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
239
+ %ileft_142 = arith.muli %y_141, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
240
+ %ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({
241
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
242
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
243
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
244
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
245
+ %ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
246
+ %ileft_145 = tt.broadcast %ileft_144 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
247
+ %iright_146 = arith.muli %y_141, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
248
+ %iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({
249
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
250
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
251
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
252
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
253
+ %iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
254
+ %iright_149 = tt.broadcast %iright_148 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
255
+ %ileft_150 = tt.reshape %ileft_145 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
256
+ %iright_151 = tt.reshape %iright_149 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
257
+ %y_idx_152 = tt.reshape %new_idxs_140 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166)
258
+ %left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168)
259
+ %left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({
260
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
261
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
262
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
263
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
264
+ %left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
265
+ %left_idx_156 = tt.broadcast %left_idx_155 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
266
+ %right_idx_157 = arith.muli %y_idx_152, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173)
267
+ %right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({
268
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
269
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
270
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
271
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
272
+ %right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
273
+ %right_idx_160 = tt.broadcast %right_idx_159 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
274
+ %left_idx_161 = tt.reshape %left_idx_156 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
275
+ %right_idx_162 = tt.reshape %right_idx_160 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
276
+ %cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc179)
277
+ %eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc180)
278
+ %cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<8x16xi32, #linear> loc(#loc181)
279
+ %cond_166 = arith.andi %eq_164, %cond_165 : tensor<8x16xi1, #linear> loc(#loc182)
280
+ %cond_167 = arith.ori %cond_163, %cond_166 : tensor<8x16xi1, #linear> loc(#loc183)
281
+ %cond_168 = arith.extui %cond_167 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
282
+ %cond_169 = arith.xori %cond_168, %flip_103 : tensor<8x16xi32, #linear> loc(#loc184)
283
+ %cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<8x16xi32, #linear> loc(#loc185)
284
+ %ret_171 = arith.xori %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc186)
285
+ %ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
286
+ %ret_173 = arith.xori %ret_137, %ret_172 : tensor<8x16xi32, #linear> loc(#loc188)
287
+ %new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<8x16xi32, #linear> loc(#loc189)
288
+ %new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
289
+ %new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<8x16xi32, #linear> loc(#loc191)
290
+ %flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc147)
291
+ %flip_178 = tt.reshape %flip_177 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc148)
292
+ %y_179 = tt.reshape %ret_173 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc154)
293
+ %ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc156)
294
+ %ileft_181 = arith.muli %y_179, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc156)
295
+ %ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({
296
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
297
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
298
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
299
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
300
+ %ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc158)
301
+ %ileft_184 = tt.broadcast %ileft_183 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc159)
302
+ %iright_185 = arith.muli %y_179, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc160)
303
+ %iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({
304
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
305
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
306
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
307
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
308
+ %iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc162)
309
+ %iright_188 = tt.broadcast %iright_187 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc163)
310
+ %ileft_189 = tt.reshape %ileft_184 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc164)
311
+ %iright_190 = tt.reshape %iright_188 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc165)
312
+ %y_idx_191 = tt.reshape %new_idxs_176 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc166)
313
+ %left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc168)
314
+ %left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({
315
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
316
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
317
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
318
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
319
+ %left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc170)
320
+ %left_idx_195 = tt.broadcast %left_idx_194 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc171)
321
+ %right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc173)
322
+ %right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({
323
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
324
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
325
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
326
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
327
+ %right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc175)
328
+ %right_idx_199 = tt.broadcast %right_idx_198 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc176)
329
+ %left_idx_200 = tt.reshape %left_idx_195 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc177)
330
+ %right_idx_201 = tt.reshape %right_idx_199 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc178)
331
+ %cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc179)
332
+ %eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc180)
333
+ %cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<8x16xi32, #linear> loc(#loc181)
334
+ %cond_205 = arith.andi %eq_203, %cond_204 : tensor<8x16xi1, #linear> loc(#loc182)
335
+ %cond_206 = arith.ori %cond_202, %cond_205 : tensor<8x16xi1, #linear> loc(#loc183)
336
+ %cond_207 = arith.extui %cond_206 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
337
+ %cond_208 = arith.xori %cond_207, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184)
338
+ %cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<8x16xi32, #linear> loc(#loc185)
339
+ %ret_210 = arith.xori %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc186)
340
+ %ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
341
+ %ret_212 = arith.xori %ret_173, %ret_211 : tensor<8x16xi32, #linear> loc(#loc188)
342
+ %new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<8x16xi32, #linear> loc(#loc189)
343
+ %new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
344
+ %new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<8x16xi32, #linear> loc(#loc191)
345
+ %y_216 = tt.reshape %ret_212 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc154)
346
+ %ileft_217 = arith.muli %y_216, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156)
347
+ %ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({
348
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
349
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
350
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
351
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
352
+ %ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158)
353
+ %ileft_220 = tt.broadcast %ileft_219 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159)
354
+ %iright_221 = arith.muli %y_216, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160)
355
+ %iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({
356
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
357
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
358
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
359
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
360
+ %iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162)
361
+ %iright_224 = tt.broadcast %iright_223 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163)
362
+ %ileft_225 = tt.reshape %ileft_220 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164)
363
+ %iright_226 = tt.reshape %iright_224 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165)
364
+ %y_idx_227 = tt.reshape %new_idxs_215 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166)
365
+ %left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168)
366
+ %left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({
367
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
368
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
369
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
370
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
371
+ %left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170)
372
+ %left_idx_231 = tt.broadcast %left_idx_230 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171)
373
+ %right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173)
374
+ %right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({
375
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
376
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
377
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
378
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
379
+ %right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175)
380
+ %right_idx_235 = tt.broadcast %right_idx_234 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176)
381
+ %left_idx_236 = tt.reshape %left_idx_231 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177)
382
+ %right_idx_237 = tt.reshape %right_idx_235 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178)
383
+ %cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc179)
384
+ %eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc180)
385
+ %cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<8x16xi32, #linear> loc(#loc181)
386
+ %cond_241 = arith.andi %eq_239, %cond_240 : tensor<8x16xi1, #linear> loc(#loc182)
387
+ %cond_242 = arith.ori %cond_238, %cond_241 : tensor<8x16xi1, #linear> loc(#loc183)
388
+ %cond_243 = arith.extui %cond_242 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
389
+ %cond_244 = arith.xori %cond_243, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184)
390
+ %cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<8x16xi32, #linear> loc(#loc185)
391
+ %ret_246 = arith.xori %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc186)
392
+ %ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
393
+ %ret_248 = arith.xori %ret_212, %ret_247 : tensor<8x16xi32, #linear> loc(#loc188)
394
+ %new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<8x16xi32, #linear> loc(#loc189)
395
+ %new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
396
+ %new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<8x16xi32, #linear> loc(#loc191)
397
+ %y_252 = tt.reshape %ret_248 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
398
+ %ileft_253 = arith.muli %y_252, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
399
+ %ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({
400
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
401
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
402
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
403
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
404
+ %ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
405
+ %ileft_256 = tt.broadcast %ileft_255 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
406
+ %iright_257 = arith.muli %y_252, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
407
+ %iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({
408
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
409
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
410
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
411
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
412
+ %iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
413
+ %iright_260 = tt.broadcast %iright_259 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
414
+ %ileft_261 = tt.reshape %ileft_256 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
415
+ %iright_262 = tt.reshape %iright_260 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
416
+ %y_idx_263 = tt.reshape %new_idxs_251 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166)
417
+ %left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168)
418
+ %left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({
419
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
420
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
421
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
422
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
423
+ %left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
424
+ %left_idx_267 = tt.broadcast %left_idx_266 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
425
+ %right_idx_268 = arith.muli %y_idx_263, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173)
426
+ %right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({
427
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
428
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
429
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
430
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
431
+ %right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
432
+ %right_idx_271 = tt.broadcast %right_idx_270 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
433
+ %left_idx_272 = tt.reshape %left_idx_267 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
434
+ %right_idx_273 = tt.reshape %right_idx_271 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
435
+ %cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc179)
436
+ %eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc180)
437
+ %cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<8x16xi32, #linear> loc(#loc181)
438
+ %cond_277 = arith.andi %eq_275, %cond_276 : tensor<8x16xi1, #linear> loc(#loc182)
439
+ %cond_278 = arith.ori %cond_274, %cond_277 : tensor<8x16xi1, #linear> loc(#loc183)
440
+ %cond_279 = arith.extui %cond_278 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
441
+ %cond_280 = arith.xori %cond_279, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184)
442
+ %cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<8x16xi32, #linear> loc(#loc185)
443
+ %ret_282 = arith.xori %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc186)
444
+ %ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
445
+ %ret_284 = arith.xori %ret_248, %ret_283 : tensor<8x16xi32, #linear> loc(#loc188)
446
+ %new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<8x16xi32, #linear> loc(#loc189)
447
+ %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
448
+ %new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<8x16xi32, #linear> loc(#loc191)
449
+ %y_288 = tt.reshape %ret_284 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc154)
450
+ %ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc156)
451
+ %ileft_290 = arith.muli %y_288, %ileft_289 : tensor<8x2x8xi32, #linear4> loc(#loc156)
452
+ %ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({
453
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
454
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
455
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
456
+ }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193)
457
+ %ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc158)
458
+ %ileft_293 = tt.broadcast %ileft_292 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc159)
459
+ %iright_294 = arith.muli %y_288, %flip_177 : tensor<8x2x8xi32, #linear4> loc(#loc160)
460
+ %iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({
461
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
462
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
463
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
464
+ }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195)
465
+ %iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc162)
466
+ %iright_297 = tt.broadcast %iright_296 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc163)
467
+ %ileft_298 = tt.reshape %ileft_293 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc164)
468
+ %iright_299 = tt.reshape %iright_297 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc165)
469
+ %y_idx_300 = tt.reshape %new_idxs_287 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc166)
470
+ %left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<8x2x8xi32, #linear4> loc(#loc168)
471
+ %left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({
472
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
473
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
474
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
475
+ }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198)
476
+ %left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc170)
477
+ %left_idx_304 = tt.broadcast %left_idx_303 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc171)
478
+ %right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<8x2x8xi32, #linear4> loc(#loc173)
479
+ %right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({
480
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
481
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
482
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
483
+ }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201)
484
+ %right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc175)
485
+ %right_idx_308 = tt.broadcast %right_idx_307 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc176)
486
+ %left_idx_309 = tt.reshape %left_idx_304 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc177)
487
+ %right_idx_310 = tt.reshape %right_idx_308 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc178)
488
+ %cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc179)
489
+ %eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc180)
490
+ %cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<8x16xi32, #linear> loc(#loc181)
491
+ %cond_314 = arith.andi %eq_312, %cond_313 : tensor<8x16xi1, #linear> loc(#loc182)
492
+ %cond_315 = arith.ori %cond_311, %cond_314 : tensor<8x16xi1, #linear> loc(#loc183)
493
+ %ret_316 = arith.xori %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc186)
494
+ %ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
495
+ %ret_318 = arith.xori %ret_284, %ret_317 : tensor<8x16xi32, #linear> loc(#loc188)
496
+ %new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<8x16xi32, #linear> loc(#loc189)
497
+ %new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
498
+ %new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<8x16xi32, #linear> loc(#loc191)
499
+ %y_322 = tt.reshape %ret_318 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc154)
500
+ %ileft_323 = arith.muli %y_322, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc156)
501
+ %ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({
502
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
503
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
504
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
505
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
506
+ %ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc158)
507
+ %ileft_326 = tt.broadcast %ileft_325 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc159)
508
+ %iright_327 = arith.muli %y_322, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc160)
509
+ %iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({
510
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
511
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
512
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
513
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
514
+ %iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc162)
515
+ %iright_330 = tt.broadcast %iright_329 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc163)
516
+ %ileft_331 = tt.reshape %ileft_326 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc164)
517
+ %iright_332 = tt.reshape %iright_330 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc165)
518
+ %y_idx_333 = tt.reshape %new_idxs_321 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc166)
519
+ %left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc168)
520
+ %left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({
521
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
522
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
523
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
524
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
525
+ %left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc170)
526
+ %left_idx_337 = tt.broadcast %left_idx_336 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc171)
527
+ %right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc173)
528
+ %right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({
529
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
530
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
531
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
532
+ }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
533
+ %right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc175)
534
+ %right_idx_341 = tt.broadcast %right_idx_340 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc176)
535
+ %left_idx_342 = tt.reshape %left_idx_337 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc177)
536
+ %right_idx_343 = tt.reshape %right_idx_341 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc178)
537
+ %cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc179)
538
+ %eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc180)
539
+ %cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<8x16xi32, #linear> loc(#loc181)
540
+ %cond_347 = arith.andi %eq_345, %cond_346 : tensor<8x16xi1, #linear> loc(#loc182)
541
+ %cond_348 = arith.ori %cond_344, %cond_347 : tensor<8x16xi1, #linear> loc(#loc183)
542
+ %ret_349 = arith.xori %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc186)
543
+ %ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
544
+ %ret_351 = arith.xori %ret_318, %ret_350 : tensor<8x16xi32, #linear> loc(#loc188)
545
+ %new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<8x16xi32, #linear> loc(#loc189)
546
+ %new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
547
+ %new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<8x16xi32, #linear> loc(#loc191)
548
+ %y_355 = tt.reshape %ret_351 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc154)
549
+ %ileft_356 = arith.muli %y_355, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156)
550
+ %ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({
551
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
552
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
553
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
554
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
555
+ %ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158)
556
+ %ileft_359 = tt.broadcast %ileft_358 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159)
557
+ %iright_360 = arith.muli %y_355, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160)
558
+ %iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({
559
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
560
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
561
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
562
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
563
+ %iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162)
564
+ %iright_363 = tt.broadcast %iright_362 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163)
565
+ %ileft_364 = tt.reshape %ileft_359 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164)
566
+ %iright_365 = tt.reshape %iright_363 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165)
567
+ %y_idx_366 = tt.reshape %new_idxs_354 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166)
568
+ %left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168)
569
+ %left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({
570
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
571
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
572
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
573
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
574
+ %left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170)
575
+ %left_idx_370 = tt.broadcast %left_idx_369 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171)
576
+ %right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173)
577
+ %right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({
578
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
579
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
580
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
581
+ }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
582
+ %right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175)
583
+ %right_idx_374 = tt.broadcast %right_idx_373 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176)
584
+ %left_idx_375 = tt.reshape %left_idx_370 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177)
585
+ %right_idx_376 = tt.reshape %right_idx_374 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178)
586
+ %cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc179)
587
+ %eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc180)
588
+ %cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<8x16xi32, #linear> loc(#loc181)
589
+ %cond_380 = arith.andi %eq_378, %cond_379 : tensor<8x16xi1, #linear> loc(#loc182)
590
+ %cond_381 = arith.ori %cond_377, %cond_380 : tensor<8x16xi1, #linear> loc(#loc183)
591
+ %ret_382 = arith.xori %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc186)
592
+ %ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
593
+ %ret_384 = arith.xori %ret_351, %ret_383 : tensor<8x16xi32, #linear> loc(#loc188)
594
+ %new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<8x16xi32, #linear> loc(#loc189)
595
+ %new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
596
+ %new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<8x16xi32, #linear> loc(#loc191)
597
+ %y_388 = tt.reshape %ret_384 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
598
+ %ileft_389 = arith.muli %y_388, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
599
+ %ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({
600
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
601
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
602
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
603
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
604
+ %ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
605
+ %ileft_392 = tt.broadcast %ileft_391 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
606
+ %iright_393 = arith.muli %y_388, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
607
+ %iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({
608
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
609
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
610
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
611
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
612
+ %iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
613
+ %iright_396 = tt.broadcast %iright_395 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
614
+ %ileft_397 = tt.reshape %ileft_392 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
615
+ %iright_398 = tt.reshape %iright_396 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
616
+ %y_idx_399 = tt.reshape %new_idxs_387 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166)
617
+ %left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168)
618
+ %left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({
619
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
620
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
621
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
622
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
623
+ %left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
624
+ %left_idx_403 = tt.broadcast %left_idx_402 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
625
+ %right_idx_404 = arith.muli %y_idx_399, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173)
626
+ %right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({
627
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
628
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
629
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
630
+ }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
631
+ %right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
632
+ %right_idx_407 = tt.broadcast %right_idx_406 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
633
+ %left_idx_408 = tt.reshape %left_idx_403 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
634
+ %right_idx_409 = tt.reshape %right_idx_407 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
635
+ %cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<8x16xi32, #linear> loc(#loc179)
636
+ %eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<8x16xi32, #linear> loc(#loc180)
637
+ %cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<8x16xi32, #linear> loc(#loc181)
638
+ %cond_413 = arith.andi %eq_411, %cond_412 : tensor<8x16xi1, #linear> loc(#loc182)
639
+ %cond_414 = arith.ori %cond_410, %cond_413 : tensor<8x16xi1, #linear> loc(#loc183)
640
+ %new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<8x16xi32, #linear> loc(#loc189)
641
+ %new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
642
+ %new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<8x16xi32, #linear> loc(#loc191)
643
+ %tmp7 = arith.extsi %tmp0_36 : tensor<8x16xi32, #blocked> to tensor<8x16xi64, #blocked> loc(#loc141)
644
+ %tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc142)
645
+ %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({
646
+ ^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))):
647
+ %tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192)
648
+ tt.reduce.return %tmp11_421 : i64 loc(#loc152)
649
+ }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152)
650
+ %tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc144)
651
+ %tmp14 = arith.trunci %tmp11_418 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc145)
652
+ %0 = arith.muli %xindex_19, %cst_4 : tensor<8x1xi32, #blocked1> loc(#loc70)
653
+ %1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<8x16xi32, #blocked1> loc(#loc71)
654
+ %2 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x16xi32, #blocked1> loc(#loc71)
655
+ %3 = arith.addi %1, %2 : tensor<8x16xi32, #blocked1> loc(#loc71)
656
+ %4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>, #blocked1> loc(#loc72)
657
+ %5 = tt.addptr %4, %3 : tensor<8x16x!tt.ptr<i32>, #blocked1>, tensor<8x16xi32, #blocked1> loc(#loc72)
658
+ %6 = ttg.convert_layout %new_idxs_417 : tensor<8x16xi32, #linear> -> tensor<8x16xi32, #blocked1> loc(#loc73)
659
+ tt.store %5, %6, %tmp0_35 : tensor<8x16x!tt.ptr<i32>, #blocked1> loc(#loc73)
660
+ %7 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<8x1x!tt.ptr<i32>, #blocked> loc(#loc74)
661
+ %8 = tt.addptr %7, %xindex_18 : tensor<8x1x!tt.ptr<i32>, #blocked>, tensor<8x1xi32, #blocked> loc(#loc74)
662
+ tt.store %8, %tmp14, %xmask : tensor<8x1x!tt.ptr<i32>, #blocked> loc(#loc75)
663
+ tt.return loc(#loc76)
664
+ } loc(#loc)
665
+ } loc(#loc)
666
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28)
667
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33)
668
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44)
669
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23)
670
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21)
671
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38)
672
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19)
673
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19)
674
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38)
675
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35)
676
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49)
677
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45)
678
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30)
679
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54)
680
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19)
681
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":40:33)
682
+ #loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
683
+ #loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
684
+ #loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
685
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
686
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
687
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
688
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
689
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
690
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
691
+ #loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
692
+ #loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
693
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
694
+ #loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
695
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
696
+ #loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
697
+ #loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
698
+ #loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
699
+ #loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
700
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
701
+ #loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
702
+ #loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
703
+ #loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
704
+ #loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
705
+ #loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
706
+ #loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
707
+ #loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
708
+ #loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
709
+ #loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
710
+ #loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
711
+ #loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
712
+ #loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
713
+ #loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
714
+ #loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
715
+ #loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
716
+ #loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
717
+ #loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
718
+ #loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
719
+ #loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
720
+ #loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
721
+ #loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
722
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19)
723
+ #loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34)
724
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29)
725
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21)
726
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35)
727
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32)
728
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25)
729
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47)
730
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25)
731
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37)
732
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4)
733
+ #loc82 = loc("xoffset"(#loc2))
734
+ #loc83 = loc("xoffset"(#loc3))
735
+ #loc84 = loc("xindex"(#loc4))
736
+ #loc85 = loc("xindex"(#loc5))
737
+ #loc86 = loc("xmask"(#loc6))
738
+ #loc87 = loc("r0_index"(#loc7))
739
+ #loc88 = loc("x0"(#loc8))
740
+ #loc89 = loc("x1"(#loc9))
741
+ #loc90 = loc("tmp0"(#loc10))
742
+ #loc91 = loc("tmp0"(#loc11))
743
+ #loc92 = loc("tmp0"(#loc12))
744
+ #loc93 = loc("tmp0"(#loc13))
745
+ #loc94 = loc("tmp0"(#loc14))
746
+ #loc95 = loc("tmp0"(#loc15))
747
+ #loc96 = loc("tmp2"(#loc16))
748
+ #loc97 = loc("tmp4"(#loc17))
749
+ #loc98 = loc("flip"(#loc18))
750
+ #loc100 = loc("flip"(#loc21))
751
+ #loc101 = loc("flip"(#loc22))
752
+ #loc102 = loc("y"(#loc23))
753
+ #loc103 = loc("left_mask"(#loc25))
754
+ #loc104 = loc("ileft"(#loc26))
755
+ #loc106 = loc("ileft"(#loc30))
756
+ #loc107 = loc("ileft"(#loc31))
757
+ #loc108 = loc("iright"(#loc32))
758
+ #loc110 = loc("iright"(#loc34))
759
+ #loc111 = loc("iright"(#loc35))
760
+ #loc112 = loc("ileft"(#loc36))
761
+ #loc113 = loc("iright"(#loc37))
762
+ #loc114 = loc("y_idx"(#loc38))
763
+ #loc115 = loc("left_idx"(#loc39))
764
+ #loc116 = loc("left_idx"(#loc40))
765
+ #loc117 = loc("input"(#loc41))
766
+ #loc119 = loc("left_idx"(#loc43))
767
+ #loc120 = loc("left_idx"(#loc44))
768
+ #loc121 = loc("right_idx"(#loc45))
769
+ #loc122 = loc("right_idx"(#loc46))
770
+ #loc124 = loc("right_idx"(#loc48))
771
+ #loc125 = loc("right_idx"(#loc49))
772
+ #loc126 = loc("left_idx"(#loc50))
773
+ #loc127 = loc("right_idx"(#loc51))
774
+ #loc128 = loc("cond"(#loc52))
775
+ #loc129 = loc("eq"(#loc53))
776
+ #loc130 = loc("cond"(#loc54))
777
+ #loc131 = loc("cond"(#loc55))
778
+ #loc132 = loc("cond"(#loc56))
779
+ #loc133 = loc("cond"(#loc57))
780
+ #loc134 = loc("cond"(#loc58))
781
+ #loc135 = loc("ret"(#loc59))
782
+ #loc136 = loc("ret"(#loc60))
783
+ #loc137 = loc("ret"(#loc61))
784
+ #loc138 = loc("new_idxs"(#loc62))
785
+ #loc139 = loc("new_idxs"(#loc63))
786
+ #loc140 = loc("new_idxs"(#loc64))
787
+ #loc141 = loc("tmp7"(#loc65))
788
+ #loc142 = loc("tmp10"(#loc66))
789
+ #loc144 = loc("tmp11"(#loc68))
790
+ #loc145 = loc("tmp14"(#loc69))
791
+ #loc146 = loc(callsite(#loc98 at #loc99))
792
+ #loc147 = loc(callsite(#loc100 at #loc99))
793
+ #loc148 = loc(callsite(#loc101 at #loc99))
794
+ #loc150 = loc("cond"(#loc128))
795
+ #loc151 = loc("eq"(#loc129))
796
+ #loc152 = loc(callsite(#loc27 at #loc143))
797
+ #loc154 = loc(callsite(#loc102 at #loc149))
798
+ #loc155 = loc(callsite(#loc103 at #loc149))
799
+ #loc156 = loc(callsite(#loc104 at #loc149))
800
+ #loc158 = loc(callsite(#loc106 at #loc149))
801
+ #loc159 = loc(callsite(#loc107 at #loc149))
802
+ #loc160 = loc(callsite(#loc108 at #loc149))
803
+ #loc162 = loc(callsite(#loc110 at #loc149))
804
+ #loc163 = loc(callsite(#loc111 at #loc149))
805
+ #loc164 = loc(callsite(#loc112 at #loc149))
806
+ #loc165 = loc(callsite(#loc113 at #loc149))
807
+ #loc166 = loc(callsite(#loc114 at #loc149))
808
+ #loc167 = loc(callsite(#loc115 at #loc149))
809
+ #loc168 = loc(callsite(#loc116 at #loc149))
810
+ #loc170 = loc(callsite(#loc119 at #loc149))
811
+ #loc171 = loc(callsite(#loc120 at #loc149))
812
+ #loc172 = loc(callsite(#loc121 at #loc149))
813
+ #loc173 = loc(callsite(#loc122 at #loc149))
814
+ #loc175 = loc(callsite(#loc124 at #loc149))
815
+ #loc176 = loc(callsite(#loc125 at #loc149))
816
+ #loc177 = loc(callsite(#loc126 at #loc149))
817
+ #loc178 = loc(callsite(#loc127 at #loc149))
818
+ #loc179 = loc(callsite(#loc150 at #loc149))
819
+ #loc180 = loc(callsite(#loc151 at #loc149))
820
+ #loc181 = loc(callsite(#loc130 at #loc149))
821
+ #loc182 = loc(callsite(#loc131 at #loc149))
822
+ #loc183 = loc(callsite(#loc132 at #loc149))
823
+ #loc184 = loc(callsite(#loc133 at #loc149))
824
+ #loc185 = loc(callsite(#loc134 at #loc149))
825
+ #loc186 = loc(callsite(#loc135 at #loc149))
826
+ #loc187 = loc(callsite(#loc136 at #loc149))
827
+ #loc188 = loc(callsite(#loc137 at #loc149))
828
+ #loc189 = loc(callsite(#loc138 at #loc149))
829
+ #loc190 = loc(callsite(#loc139 at #loc149))
830
+ #loc191 = loc(callsite(#loc140 at #loc149))
831
+ #loc192 = loc(callsite(#loc29 at #loc152))
832
+ #loc193 = loc(callsite(#loc27 at #loc157))
833
+ #loc195 = loc(callsite(#loc27 at #loc161))
834
+ #loc197 = loc(callsite(#loc117 at #loc169))
835
+ #loc198 = loc(callsite(#loc27 at #loc169))
836
+ #loc200 = loc(callsite(#loc117 at #loc174))
837
+ #loc201 = loc(callsite(#loc27 at #loc174))
838
+ #loc203 = loc(callsite(#loc29 at #loc193))
839
+ #loc204 = loc(callsite(#loc29 at #loc195))
840
+ #loc205 = loc(callsite(#loc29 at #loc198))
841
+ #loc206 = loc(callsite(#loc29 at #loc201))
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67)
4
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
5
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
6
+ #loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
7
+ #loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
8
+ #loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
9
+ #loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
10
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26)
11
+ #loc80 = loc("in_ptr0"(#loc))
12
+ #loc81 = loc("out_ptr2"(#loc))
13
+ #loc82 = loc("out_ptr3"(#loc))
14
+ #loc83 = loc("xnumel"(#loc))
15
+ #loc84 = loc("r0_numel"(#loc))
16
+ #loc106 = loc(callsite(#loc23 at #loc2))
17
+ #loc113 = loc("ileft"(#loc32))
18
+ #loc117 = loc("iright"(#loc37))
19
+ #loc126 = loc("left_idx"(#loc46))
20
+ #loc131 = loc("right_idx"(#loc51))
21
+ #loc150 = loc("tmp11"(#loc70))
22
+ #loc157 = loc(callsite(#loc28 at #loc106))
23
+ #loc161 = loc(callsite(#loc1 at #loc150))
24
+ #loc165 = loc(callsite(#loc113 at #loc157))
25
+ #loc169 = loc(callsite(#loc117 at #loc157))
26
+ #loc177 = loc(callsite(#loc126 at #loc157))
27
+ #loc182 = loc(callsite(#loc131 at #loc157))
28
+ #loc202 = loc(callsite(#loc1 at #loc165))
29
+ #loc204 = loc(callsite(#loc1 at #loc169))
30
+ #loc207 = loc(callsite(#loc1 at #loc177))
31
+ #loc210 = loc(callsite(#loc1 at #loc182))
32
+ module {
33
+ tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
34
+ %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85)
35
+ %cst_0 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1)
36
+ %tmp10 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc86)
37
+ %tmp0 = arith.constant dense<272> : tensor<8x1xi32> loc(#loc87)
38
+ %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88)
39
+ %cst_2 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc1)
40
+ %xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc89)
41
+ %c8_i32 = arith.constant 8 : i32 loc(#loc1)
42
+ %xoffset = tt.get_program_id x : i32 loc(#loc90)
43
+ %xoffset_3 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc91)
44
+ %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc92)
45
+ %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc93)
46
+ %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<8x1xi32> loc(#loc94)
47
+ %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<8x1xi32> loc(#loc94)
48
+ %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<8x1xi32> loc(#loc89)
49
+ %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95)
50
+ %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96)
51
+ %x0 = arith.remsi %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc97)
52
+ %x1 = arith.divsi %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc98)
53
+ %tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88)
54
+ %tmp0_10 = tt.broadcast %x0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc99)
55
+ %tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc99)
56
+ %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x16xi32> loc(#loc99)
57
+ %tmp0_13 = arith.muli %x1, %tmp0 : tensor<8x1xi32> loc(#loc87)
58
+ %tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc100)
59
+ %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x16xi32> loc(#loc100)
60
+ %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>> loc(#loc101)
61
+ %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<8x16x!tt.ptr<i32>>, tensor<8x16xi32> loc(#loc101)
62
+ %tmp0_18 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc102)
63
+ %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<8x16x!tt.ptr<i32>> loc(#loc102)
64
+ %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103)
65
+ %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc104)
66
+ %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153)
67
+ %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154)
68
+ %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154)
69
+ %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc155)
70
+ %flip_23 = tt.reshape %flip_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc156)
71
+ %y = tt.reshape %tmp0_19 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
72
+ %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163)
73
+ %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc164)
74
+ %ileft_24 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc164)
75
+ %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({
76
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
77
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
78
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
79
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
80
+ %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
81
+ %ileft_27 = tt.broadcast %ileft_26 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
82
+ %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc168)
83
+ %iright_28 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc168)
84
+ %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({
85
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
86
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
87
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
88
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
89
+ %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
90
+ %iright_31 = tt.broadcast %iright_30 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
91
+ %ileft_32 = tt.reshape %ileft_27 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
92
+ %iright_33 = tt.reshape %iright_31 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
93
+ %y_idx = tt.reshape %tmp4 : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc174)
94
+ %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175)
95
+ %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc176)
96
+ %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<64x2x1xi16> loc(#loc176)
97
+ %input = arith.extsi %left_idx_35 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc205)
98
+ %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
99
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
100
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
101
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
102
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
103
+ %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
104
+ %left_idx_38 = tt.broadcast %left_idx_37 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
105
+ %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180)
106
+ %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc181)
107
+ %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<64x2x1xi16> loc(#loc181)
108
+ %input_41 = arith.extsi %right_idx_40 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc208)
109
+ %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({
110
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
111
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
112
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
113
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
114
+ %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
115
+ %right_idx_44 = tt.broadcast %right_idx_43 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
116
+ %left_idx_45 = tt.reshape %left_idx_38 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
117
+ %right_idx_46 = tt.reshape %right_idx_44 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
118
+ %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc187)
119
+ %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc188)
120
+ %cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<8x16xi32> loc(#loc189)
121
+ %cond_48 = arith.andi %eq, %cond_47 : tensor<8x16xi1> loc(#loc190)
122
+ %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc191)
123
+ %cond_50 = arith.extui %cond_49 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
124
+ %cond_51 = arith.xori %cond_50, %flip_23 : tensor<8x16xi32> loc(#loc192)
125
+ %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<8x16xi32> loc(#loc193)
126
+ %ret = arith.xori %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc194)
127
+ %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
128
+ %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<8x16xi32> loc(#loc196)
129
+ %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<8x16xi32> loc(#loc197)
130
+ %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
131
+ %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199)
132
+ %new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc199)
133
+ %new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<8x16xi32> loc(#loc199)
134
+ %flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc155)
135
+ %flip_60 = tt.reshape %flip_59 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc156)
136
+ %y_61 = tt.reshape %ret_54 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162)
137
+ %ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc164)
138
+ %ileft_63 = arith.muli %y_61, %ileft_62 : tensor<32x2x2xi32> loc(#loc164)
139
+ %ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({
140
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
141
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
142
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
143
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201)
144
+ %ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166)
145
+ %ileft_66 = tt.broadcast %ileft_65 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167)
146
+ %iright_67 = arith.muli %y_61, %flip_22 : tensor<32x2x2xi32> loc(#loc168)
147
+ %iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({
148
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
149
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
150
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
151
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203)
152
+ %iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170)
153
+ %iright_70 = tt.broadcast %iright_69 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171)
154
+ %ileft_71 = tt.reshape %ileft_66 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172)
155
+ %iright_72 = tt.reshape %iright_70 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173)
156
+ %y_idx_73 = tt.reshape %new_idxs_58 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174)
157
+ %left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<32x2x2xi32> loc(#loc176)
158
+ %left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({
159
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
160
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
161
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
162
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206)
163
+ %left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178)
164
+ %left_idx_77 = tt.broadcast %left_idx_76 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179)
165
+ %right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<32x2x2xi32> loc(#loc181)
166
+ %right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({
167
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
168
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
169
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
170
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209)
171
+ %right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183)
172
+ %right_idx_81 = tt.broadcast %right_idx_80 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184)
173
+ %left_idx_82 = tt.reshape %left_idx_77 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185)
174
+ %right_idx_83 = tt.reshape %right_idx_81 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186)
175
+ %cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc187)
176
+ %eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc188)
177
+ %cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<8x16xi32> loc(#loc189)
178
+ %cond_87 = arith.andi %eq_85, %cond_86 : tensor<8x16xi1> loc(#loc190)
179
+ %cond_88 = arith.ori %cond_84, %cond_87 : tensor<8x16xi1> loc(#loc191)
180
+ %cond_89 = arith.extui %cond_88 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
181
+ %cond_90 = arith.xori %cond_89, %flip_60 : tensor<8x16xi32> loc(#loc192)
182
+ %cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<8x16xi32> loc(#loc193)
183
+ %ret_92 = arith.xori %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc194)
184
+ %ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
185
+ %ret_94 = arith.xori %ret_54, %ret_93 : tensor<8x16xi32> loc(#loc196)
186
+ %new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<8x16xi32> loc(#loc197)
187
+ %new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
188
+ %new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<8x16xi32> loc(#loc199)
189
+ %y_98 = tt.reshape %ret_94 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
190
+ %ileft_99 = arith.muli %y_98, %ileft : tensor<64x2x1xi32> loc(#loc164)
191
+ %ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({
192
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
193
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
194
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
195
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
196
+ %ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
197
+ %ileft_102 = tt.broadcast %ileft_101 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
198
+ %iright_103 = arith.muli %y_98, %iright : tensor<64x2x1xi32> loc(#loc168)
199
+ %iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({
200
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
201
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
202
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
203
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
204
+ %iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
205
+ %iright_106 = tt.broadcast %iright_105 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
206
+ %ileft_107 = tt.reshape %ileft_102 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
207
+ %iright_108 = tt.reshape %iright_106 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
208
+ %y_idx_109 = tt.reshape %new_idxs_97 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174)
209
+ %left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<64x2x1xi32> loc(#loc176)
210
+ %left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({
211
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
212
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
213
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
214
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
215
+ %left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
216
+ %left_idx_113 = tt.broadcast %left_idx_112 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
217
+ %right_idx_114 = arith.muli %y_idx_109, %iright : tensor<64x2x1xi32> loc(#loc181)
218
+ %right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({
219
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
220
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
221
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
222
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
223
+ %right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
224
+ %right_idx_117 = tt.broadcast %right_idx_116 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
225
+ %left_idx_118 = tt.reshape %left_idx_113 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
226
+ %right_idx_119 = tt.reshape %right_idx_117 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
227
+ %cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc187)
228
+ %eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc188)
229
+ %cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<8x16xi32> loc(#loc189)
230
+ %cond_123 = arith.andi %eq_121, %cond_122 : tensor<8x16xi1> loc(#loc190)
231
+ %cond_124 = arith.ori %cond_120, %cond_123 : tensor<8x16xi1> loc(#loc191)
232
+ %cond_125 = arith.extui %cond_124 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
233
+ %cond_126 = arith.xori %cond_125, %flip_60 : tensor<8x16xi32> loc(#loc192)
234
+ %cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<8x16xi32> loc(#loc193)
235
+ %ret_128 = arith.xori %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc194)
236
+ %ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
237
+ %ret_130 = arith.xori %ret_94, %ret_129 : tensor<8x16xi32> loc(#loc196)
238
+ %new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<8x16xi32> loc(#loc197)
239
+ %new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
240
+ %new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<8x16xi32> loc(#loc199)
241
+ %flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc155)
242
+ %flip_135 = tt.reshape %flip_134 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc156)
243
+ %y_136 = tt.reshape %ret_130 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc162)
244
+ %ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc164)
245
+ %ileft_138 = arith.muli %y_136, %ileft_137 : tensor<16x2x4xi32> loc(#loc164)
246
+ %ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({
247
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
248
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
249
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
250
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc201)
251
+ %ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc166)
252
+ %ileft_141 = tt.broadcast %ileft_140 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc167)
253
+ %iright_142 = arith.muli %y_136, %flip_59 : tensor<16x2x4xi32> loc(#loc168)
254
+ %iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({
255
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
256
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
257
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
258
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc203)
259
+ %iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc170)
260
+ %iright_145 = tt.broadcast %iright_144 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc171)
261
+ %ileft_146 = tt.reshape %ileft_141 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc172)
262
+ %iright_147 = tt.reshape %iright_145 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc173)
263
+ %y_idx_148 = tt.reshape %new_idxs_133 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc174)
264
+ %left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<16x2x4xi32> loc(#loc176)
265
+ %left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({
266
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
267
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
268
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
269
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc206)
270
+ %left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc178)
271
+ %left_idx_152 = tt.broadcast %left_idx_151 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc179)
272
+ %right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<16x2x4xi32> loc(#loc181)
273
+ %right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({
274
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
275
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
276
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
277
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc209)
278
+ %right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc183)
279
+ %right_idx_156 = tt.broadcast %right_idx_155 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc184)
280
+ %left_idx_157 = tt.reshape %left_idx_152 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc185)
281
+ %right_idx_158 = tt.reshape %right_idx_156 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc186)
282
+ %cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc187)
283
+ %eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc188)
284
+ %cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<8x16xi32> loc(#loc189)
285
+ %cond_162 = arith.andi %eq_160, %cond_161 : tensor<8x16xi1> loc(#loc190)
286
+ %cond_163 = arith.ori %cond_159, %cond_162 : tensor<8x16xi1> loc(#loc191)
287
+ %cond_164 = arith.extui %cond_163 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
288
+ %cond_165 = arith.xori %cond_164, %flip_135 : tensor<8x16xi32> loc(#loc192)
289
+ %cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<8x16xi32> loc(#loc193)
290
+ %ret_167 = arith.xori %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc194)
291
+ %ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
292
+ %ret_169 = arith.xori %ret_130, %ret_168 : tensor<8x16xi32> loc(#loc196)
293
+ %new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<8x16xi32> loc(#loc197)
294
+ %new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
295
+ %new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<8x16xi32> loc(#loc199)
296
+ %y_173 = tt.reshape %ret_169 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162)
297
+ %ileft_174 = arith.muli %y_173, %ileft_62 : tensor<32x2x2xi32> loc(#loc164)
298
+ %ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({
299
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
300
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
301
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
302
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201)
303
+ %ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166)
304
+ %ileft_177 = tt.broadcast %ileft_176 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167)
305
+ %iright_178 = arith.muli %y_173, %flip_22 : tensor<32x2x2xi32> loc(#loc168)
306
+ %iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({
307
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
308
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
309
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
310
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203)
311
+ %iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170)
312
+ %iright_181 = tt.broadcast %iright_180 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171)
313
+ %ileft_182 = tt.reshape %ileft_177 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172)
314
+ %iright_183 = tt.reshape %iright_181 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173)
315
+ %y_idx_184 = tt.reshape %new_idxs_172 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174)
316
+ %left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<32x2x2xi32> loc(#loc176)
317
+ %left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({
318
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
319
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
320
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
321
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206)
322
+ %left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178)
323
+ %left_idx_188 = tt.broadcast %left_idx_187 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179)
324
+ %right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<32x2x2xi32> loc(#loc181)
325
+ %right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({
326
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
327
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
328
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
329
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209)
330
+ %right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183)
331
+ %right_idx_192 = tt.broadcast %right_idx_191 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184)
332
+ %left_idx_193 = tt.reshape %left_idx_188 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185)
333
+ %right_idx_194 = tt.reshape %right_idx_192 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186)
334
+ %cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc187)
335
+ %eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc188)
336
+ %cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<8x16xi32> loc(#loc189)
337
+ %cond_198 = arith.andi %eq_196, %cond_197 : tensor<8x16xi1> loc(#loc190)
338
+ %cond_199 = arith.ori %cond_195, %cond_198 : tensor<8x16xi1> loc(#loc191)
339
+ %cond_200 = arith.extui %cond_199 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
340
+ %cond_201 = arith.xori %cond_200, %flip_135 : tensor<8x16xi32> loc(#loc192)
341
+ %cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<8x16xi32> loc(#loc193)
342
+ %ret_203 = arith.xori %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc194)
343
+ %ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
344
+ %ret_205 = arith.xori %ret_169, %ret_204 : tensor<8x16xi32> loc(#loc196)
345
+ %new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<8x16xi32> loc(#loc197)
346
+ %new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
347
+ %new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<8x16xi32> loc(#loc199)
348
+ %y_209 = tt.reshape %ret_205 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
349
+ %ileft_210 = arith.muli %y_209, %ileft : tensor<64x2x1xi32> loc(#loc164)
350
+ %ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({
351
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
352
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
353
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
354
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
355
+ %ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
356
+ %ileft_213 = tt.broadcast %ileft_212 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
357
+ %iright_214 = arith.muli %y_209, %iright : tensor<64x2x1xi32> loc(#loc168)
358
+ %iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({
359
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
360
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
361
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
362
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
363
+ %iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
364
+ %iright_217 = tt.broadcast %iright_216 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
365
+ %ileft_218 = tt.reshape %ileft_213 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
366
+ %iright_219 = tt.reshape %iright_217 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
367
+ %y_idx_220 = tt.reshape %new_idxs_208 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174)
368
+ %left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<64x2x1xi32> loc(#loc176)
369
+ %left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({
370
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
371
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
372
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
373
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
374
+ %left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
375
+ %left_idx_224 = tt.broadcast %left_idx_223 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
376
+ %right_idx_225 = arith.muli %y_idx_220, %iright : tensor<64x2x1xi32> loc(#loc181)
377
+ %right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({
378
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
379
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
380
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
381
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
382
+ %right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
383
+ %right_idx_228 = tt.broadcast %right_idx_227 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
384
+ %left_idx_229 = tt.reshape %left_idx_224 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
385
+ %right_idx_230 = tt.reshape %right_idx_228 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
386
+ %cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc187)
387
+ %eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc188)
388
+ %cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<8x16xi32> loc(#loc189)
389
+ %cond_234 = arith.andi %eq_232, %cond_233 : tensor<8x16xi1> loc(#loc190)
390
+ %cond_235 = arith.ori %cond_231, %cond_234 : tensor<8x16xi1> loc(#loc191)
391
+ %cond_236 = arith.extui %cond_235 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
392
+ %cond_237 = arith.xori %cond_236, %flip_135 : tensor<8x16xi32> loc(#loc192)
393
+ %cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<8x16xi32> loc(#loc193)
394
+ %ret_239 = arith.xori %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc194)
395
+ %ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
396
+ %ret_241 = arith.xori %ret_205, %ret_240 : tensor<8x16xi32> loc(#loc196)
397
+ %new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<8x16xi32> loc(#loc197)
398
+ %new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
399
+ %new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<8x16xi32> loc(#loc199)
400
+ %y_245 = tt.reshape %ret_241 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc162)
401
+ %ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc164)
402
+ %ileft_247 = arith.muli %y_245, %ileft_246 : tensor<8x2x8xi32> loc(#loc164)
403
+ %ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({
404
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
405
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
406
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
407
+ }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc201)
408
+ %ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc166)
409
+ %ileft_250 = tt.broadcast %ileft_249 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc167)
410
+ %iright_251 = arith.muli %y_245, %flip_134 : tensor<8x2x8xi32> loc(#loc168)
411
+ %iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({
412
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
413
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
414
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
415
+ }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc203)
416
+ %iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc170)
417
+ %iright_254 = tt.broadcast %iright_253 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc171)
418
+ %ileft_255 = tt.reshape %ileft_250 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc172)
419
+ %iright_256 = tt.reshape %iright_254 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc173)
420
+ %y_idx_257 = tt.reshape %new_idxs_244 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc174)
421
+ %left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<8x2x8xi32> loc(#loc176)
422
+ %left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({
423
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
424
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
425
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
426
+ }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc206)
427
+ %left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc178)
428
+ %left_idx_261 = tt.broadcast %left_idx_260 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc179)
429
+ %right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<8x2x8xi32> loc(#loc181)
430
+ %right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({
431
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
432
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
433
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
434
+ }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc209)
435
+ %right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc183)
436
+ %right_idx_265 = tt.broadcast %right_idx_264 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc184)
437
+ %left_idx_266 = tt.reshape %left_idx_261 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc185)
438
+ %right_idx_267 = tt.reshape %right_idx_265 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc186)
439
+ %cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc187)
440
+ %eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc188)
441
+ %cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<8x16xi32> loc(#loc189)
442
+ %cond_271 = arith.andi %eq_269, %cond_270 : tensor<8x16xi1> loc(#loc190)
443
+ %cond_272 = arith.ori %cond_268, %cond_271 : tensor<8x16xi1> loc(#loc191)
444
+ %ret_273 = arith.xori %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc194)
445
+ %ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
446
+ %ret_275 = arith.xori %ret_241, %ret_274 : tensor<8x16xi32> loc(#loc196)
447
+ %new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<8x16xi32> loc(#loc197)
448
+ %new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
449
+ %new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<8x16xi32> loc(#loc199)
450
+ %y_279 = tt.reshape %ret_275 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc162)
451
+ %ileft_280 = arith.muli %y_279, %ileft_137 : tensor<16x2x4xi32> loc(#loc164)
452
+ %ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({
453
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
454
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
455
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
456
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc201)
457
+ %ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc166)
458
+ %ileft_283 = tt.broadcast %ileft_282 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc167)
459
+ %iright_284 = arith.muli %y_279, %flip_59 : tensor<16x2x4xi32> loc(#loc168)
460
+ %iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({
461
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
462
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
463
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
464
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc203)
465
+ %iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc170)
466
+ %iright_287 = tt.broadcast %iright_286 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc171)
467
+ %ileft_288 = tt.reshape %ileft_283 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc172)
468
+ %iright_289 = tt.reshape %iright_287 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc173)
469
+ %y_idx_290 = tt.reshape %new_idxs_278 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc174)
470
+ %left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<16x2x4xi32> loc(#loc176)
471
+ %left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({
472
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
473
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
474
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
475
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc206)
476
+ %left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc178)
477
+ %left_idx_294 = tt.broadcast %left_idx_293 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc179)
478
+ %right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<16x2x4xi32> loc(#loc181)
479
+ %right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({
480
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
481
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
482
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
483
+ }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc209)
484
+ %right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc183)
485
+ %right_idx_298 = tt.broadcast %right_idx_297 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc184)
486
+ %left_idx_299 = tt.reshape %left_idx_294 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc185)
487
+ %right_idx_300 = tt.reshape %right_idx_298 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc186)
488
+ %cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc187)
489
+ %eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc188)
490
+ %cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<8x16xi32> loc(#loc189)
491
+ %cond_304 = arith.andi %eq_302, %cond_303 : tensor<8x16xi1> loc(#loc190)
492
+ %cond_305 = arith.ori %cond_301, %cond_304 : tensor<8x16xi1> loc(#loc191)
493
+ %ret_306 = arith.xori %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc194)
494
+ %ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
495
+ %ret_308 = arith.xori %ret_275, %ret_307 : tensor<8x16xi32> loc(#loc196)
496
+ %new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<8x16xi32> loc(#loc197)
497
+ %new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
498
+ %new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<8x16xi32> loc(#loc199)
499
+ %y_312 = tt.reshape %ret_308 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162)
500
+ %ileft_313 = arith.muli %y_312, %ileft_62 : tensor<32x2x2xi32> loc(#loc164)
501
+ %ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({
502
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
503
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
504
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
505
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201)
506
+ %ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166)
507
+ %ileft_316 = tt.broadcast %ileft_315 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167)
508
+ %iright_317 = arith.muli %y_312, %flip_22 : tensor<32x2x2xi32> loc(#loc168)
509
+ %iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({
510
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
511
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
512
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
513
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203)
514
+ %iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170)
515
+ %iright_320 = tt.broadcast %iright_319 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171)
516
+ %ileft_321 = tt.reshape %ileft_316 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172)
517
+ %iright_322 = tt.reshape %iright_320 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173)
518
+ %y_idx_323 = tt.reshape %new_idxs_311 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174)
519
+ %left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<32x2x2xi32> loc(#loc176)
520
+ %left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({
521
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
522
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
523
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
524
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206)
525
+ %left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178)
526
+ %left_idx_327 = tt.broadcast %left_idx_326 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179)
527
+ %right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<32x2x2xi32> loc(#loc181)
528
+ %right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({
529
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
530
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
531
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
532
+ }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209)
533
+ %right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183)
534
+ %right_idx_331 = tt.broadcast %right_idx_330 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184)
535
+ %left_idx_332 = tt.reshape %left_idx_327 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185)
536
+ %right_idx_333 = tt.reshape %right_idx_331 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186)
537
+ %cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc187)
538
+ %eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc188)
539
+ %cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<8x16xi32> loc(#loc189)
540
+ %cond_337 = arith.andi %eq_335, %cond_336 : tensor<8x16xi1> loc(#loc190)
541
+ %cond_338 = arith.ori %cond_334, %cond_337 : tensor<8x16xi1> loc(#loc191)
542
+ %ret_339 = arith.xori %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc194)
543
+ %ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
544
+ %ret_341 = arith.xori %ret_308, %ret_340 : tensor<8x16xi32> loc(#loc196)
545
+ %new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<8x16xi32> loc(#loc197)
546
+ %new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
547
+ %new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<8x16xi32> loc(#loc199)
548
+ %y_345 = tt.reshape %ret_341 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
549
+ %ileft_346 = arith.muli %y_345, %ileft : tensor<64x2x1xi32> loc(#loc164)
550
+ %ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({
551
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
552
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
553
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
554
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
555
+ %ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
556
+ %ileft_349 = tt.broadcast %ileft_348 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
557
+ %iright_350 = arith.muli %y_345, %iright : tensor<64x2x1xi32> loc(#loc168)
558
+ %iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({
559
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
560
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
561
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
562
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
563
+ %iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
564
+ %iright_353 = tt.broadcast %iright_352 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
565
+ %ileft_354 = tt.reshape %ileft_349 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
566
+ %iright_355 = tt.reshape %iright_353 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
567
+ %y_idx_356 = tt.reshape %new_idxs_344 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174)
568
+ %left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<64x2x1xi32> loc(#loc176)
569
+ %left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({
570
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
571
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
572
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
573
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
574
+ %left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
575
+ %left_idx_360 = tt.broadcast %left_idx_359 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
576
+ %right_idx_361 = arith.muli %y_idx_356, %iright : tensor<64x2x1xi32> loc(#loc181)
577
+ %right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({
578
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
579
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
580
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
581
+ }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
582
+ %right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
583
+ %right_idx_364 = tt.broadcast %right_idx_363 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
584
+ %left_idx_365 = tt.reshape %left_idx_360 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
585
+ %right_idx_366 = tt.reshape %right_idx_364 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
586
+ %cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<8x16xi32> loc(#loc187)
587
+ %eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<8x16xi32> loc(#loc188)
588
+ %cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<8x16xi32> loc(#loc189)
589
+ %cond_370 = arith.andi %eq_368, %cond_369 : tensor<8x16xi1> loc(#loc190)
590
+ %cond_371 = arith.ori %cond_367, %cond_370 : tensor<8x16xi1> loc(#loc191)
591
+ %new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<8x16xi32> loc(#loc197)
592
+ %new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
593
+ %new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<8x16xi32> loc(#loc199)
594
+ %tmp7 = arith.extsi %tmp0_19 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc149)
595
+ %tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc86)
596
+ %tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({
597
+ ^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))):
598
+ %tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200)
599
+ tt.reduce.return %tmp11_379 : i64 loc(#loc160)
600
+ }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc160)
601
+ %tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc151)
602
+ %tmp14 = arith.trunci %tmp11_376 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc152)
603
+ %0 = arith.muli %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc73)
604
+ %1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc74)
605
+ %2 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc74)
606
+ %3 = arith.addi %1, %2 : tensor<8x16xi32> loc(#loc74)
607
+ %4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>> loc(#loc75)
608
+ %5 = tt.addptr %4, %3 : tensor<8x16x!tt.ptr<i32>>, tensor<8x16xi32> loc(#loc75)
609
+ tt.store %5, %new_idxs_374, %tmp0_18 : tensor<8x16x!tt.ptr<i32>> loc(#loc76)
610
+ %6 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<8x1x!tt.ptr<i32>> loc(#loc77)
611
+ %7 = tt.addptr %6, %xindex_6 : tensor<8x1x!tt.ptr<i32>>, tensor<8x1xi32> loc(#loc77)
612
+ tt.store %7, %tmp14, %xmask_7 : tensor<8x1x!tt.ptr<i32>> loc(#loc78)
613
+ tt.return loc(#loc79)
614
+ } loc(#loc)
615
+ } loc(#loc)
616
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34)
617
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49)
618
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38)
619
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21)
620
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28)
621
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33)
622
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:36)
623
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44)
624
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23)
625
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:28)
626
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38)
627
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19)
628
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19)
629
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35)
630
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45)
631
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30)
632
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54)
633
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19)
634
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":40:33)
635
+ #loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41)
636
+ #loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
637
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
638
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
639
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
640
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
641
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
642
+ #loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
643
+ #loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
644
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
645
+ #loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
646
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
647
+ #loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
648
+ #loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
649
+ #loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
650
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
651
+ #loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
652
+ #loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
653
+ #loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
654
+ #loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
655
+ #loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
656
+ #loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
657
+ #loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
658
+ #loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
659
+ #loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
660
+ #loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
661
+ #loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
662
+ #loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
663
+ #loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
664
+ #loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
665
+ #loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
666
+ #loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
667
+ #loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
668
+ #loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
669
+ #loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
670
+ #loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
671
+ #loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
672
+ #loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
673
+ #loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
674
+ #loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
675
+ #loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
676
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19)
677
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29)
678
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21)
679
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35)
680
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32)
681
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25)
682
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47)
683
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25)
684
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37)
685
+ #loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4)
686
+ #loc85 = loc(callsite(#loc1 at #loc2))
687
+ #loc86 = loc("tmp10"(#loc3))
688
+ #loc87 = loc("tmp0"(#loc4))
689
+ #loc88 = loc("tmp0"(#loc5))
690
+ #loc89 = loc("xmask"(#loc6))
691
+ #loc90 = loc("xoffset"(#loc7))
692
+ #loc91 = loc("xoffset"(#loc8))
693
+ #loc92 = loc("xindex"(#loc9))
694
+ #loc93 = loc("xindex"(#loc10))
695
+ #loc94 = loc("xindex"(#loc11))
696
+ #loc95 = loc("r0_index"(#loc12))
697
+ #loc96 = loc("r0_index"(#loc13))
698
+ #loc97 = loc("x0"(#loc14))
699
+ #loc98 = loc("x1"(#loc15))
700
+ #loc99 = loc("tmp0"(#loc16))
701
+ #loc100 = loc("tmp0"(#loc17))
702
+ #loc101 = loc("tmp0"(#loc18))
703
+ #loc102 = loc("tmp0"(#loc19))
704
+ #loc103 = loc("tmp2"(#loc20))
705
+ #loc104 = loc("tmp4"(#loc21))
706
+ #loc105 = loc("flip"(#loc22))
707
+ #loc107 = loc("flip"(#loc24))
708
+ #loc108 = loc("flip"(#loc25))
709
+ #loc109 = loc("flip"(#loc26))
710
+ #loc110 = loc("y"(#loc27))
711
+ #loc111 = loc("left_mask"(#loc29))
712
+ #loc112 = loc("ileft"(#loc30))
713
+ #loc114 = loc("ileft"(#loc34))
714
+ #loc115 = loc("ileft"(#loc35))
715
+ #loc116 = loc("iright"(#loc36))
716
+ #loc118 = loc("iright"(#loc38))
717
+ #loc119 = loc("iright"(#loc39))
718
+ #loc120 = loc("ileft"(#loc40))
719
+ #loc121 = loc("iright"(#loc41))
720
+ #loc122 = loc("y_idx"(#loc42))
721
+ #loc123 = loc("left_idx"(#loc43))
722
+ #loc124 = loc("left_idx"(#loc44))
723
+ #loc125 = loc("input"(#loc45))
724
+ #loc127 = loc("left_idx"(#loc47))
725
+ #loc128 = loc("left_idx"(#loc48))
726
+ #loc129 = loc("right_idx"(#loc49))
727
+ #loc130 = loc("right_idx"(#loc50))
728
+ #loc132 = loc("right_idx"(#loc52))
729
+ #loc133 = loc("right_idx"(#loc53))
730
+ #loc134 = loc("left_idx"(#loc54))
731
+ #loc135 = loc("right_idx"(#loc55))
732
+ #loc136 = loc("cond"(#loc56))
733
+ #loc137 = loc("eq"(#loc57))
734
+ #loc138 = loc("cond"(#loc58))
735
+ #loc139 = loc("cond"(#loc59))
736
+ #loc140 = loc("cond"(#loc60))
737
+ #loc141 = loc("cond"(#loc61))
738
+ #loc142 = loc("cond"(#loc62))
739
+ #loc143 = loc("ret"(#loc63))
740
+ #loc144 = loc("ret"(#loc64))
741
+ #loc145 = loc("ret"(#loc65))
742
+ #loc146 = loc("new_idxs"(#loc66))
743
+ #loc147 = loc("new_idxs"(#loc67))
744
+ #loc148 = loc("new_idxs"(#loc68))
745
+ #loc149 = loc("tmp7"(#loc69))
746
+ #loc151 = loc("tmp11"(#loc71))
747
+ #loc152 = loc("tmp14"(#loc72))
748
+ #loc153 = loc(callsite(#loc105 at #loc106))
749
+ #loc154 = loc(callsite(#loc107 at #loc106))
750
+ #loc155 = loc(callsite(#loc108 at #loc106))
751
+ #loc156 = loc(callsite(#loc109 at #loc106))
752
+ #loc158 = loc("cond"(#loc136))
753
+ #loc159 = loc("eq"(#loc137))
754
+ #loc160 = loc(callsite(#loc31 at #loc150))
755
+ #loc162 = loc(callsite(#loc110 at #loc157))
756
+ #loc163 = loc(callsite(#loc111 at #loc157))
757
+ #loc164 = loc(callsite(#loc112 at #loc157))
758
+ #loc166 = loc(callsite(#loc114 at #loc157))
759
+ #loc167 = loc(callsite(#loc115 at #loc157))
760
+ #loc168 = loc(callsite(#loc116 at #loc157))
761
+ #loc170 = loc(callsite(#loc118 at #loc157))
762
+ #loc171 = loc(callsite(#loc119 at #loc157))
763
+ #loc172 = loc(callsite(#loc120 at #loc157))
764
+ #loc173 = loc(callsite(#loc121 at #loc157))
765
+ #loc174 = loc(callsite(#loc122 at #loc157))
766
+ #loc175 = loc(callsite(#loc123 at #loc157))
767
+ #loc176 = loc(callsite(#loc124 at #loc157))
768
+ #loc178 = loc(callsite(#loc127 at #loc157))
769
+ #loc179 = loc(callsite(#loc128 at #loc157))
770
+ #loc180 = loc(callsite(#loc129 at #loc157))
771
+ #loc181 = loc(callsite(#loc130 at #loc157))
772
+ #loc183 = loc(callsite(#loc132 at #loc157))
773
+ #loc184 = loc(callsite(#loc133 at #loc157))
774
+ #loc185 = loc(callsite(#loc134 at #loc157))
775
+ #loc186 = loc(callsite(#loc135 at #loc157))
776
+ #loc187 = loc(callsite(#loc158 at #loc157))
777
+ #loc188 = loc(callsite(#loc159 at #loc157))
778
+ #loc189 = loc(callsite(#loc138 at #loc157))
779
+ #loc190 = loc(callsite(#loc139 at #loc157))
780
+ #loc191 = loc(callsite(#loc140 at #loc157))
781
+ #loc192 = loc(callsite(#loc141 at #loc157))
782
+ #loc193 = loc(callsite(#loc142 at #loc157))
783
+ #loc194 = loc(callsite(#loc143 at #loc157))
784
+ #loc195 = loc(callsite(#loc144 at #loc157))
785
+ #loc196 = loc(callsite(#loc145 at #loc157))
786
+ #loc197 = loc(callsite(#loc146 at #loc157))
787
+ #loc198 = loc(callsite(#loc147 at #loc157))
788
+ #loc199 = loc(callsite(#loc148 at #loc157))
789
+ #loc200 = loc(callsite(#loc33 at #loc160))
790
+ #loc201 = loc(callsite(#loc31 at #loc165))
791
+ #loc203 = loc(callsite(#loc31 at #loc169))
792
+ #loc205 = loc(callsite(#loc125 at #loc177))
793
+ #loc206 = loc(callsite(#loc31 at #loc177))
794
+ #loc208 = loc(callsite(#loc125 at #loc182))
795
+ #loc209 = loc(callsite(#loc31 at #loc182))
796
+ #loc211 = loc(callsite(#loc33 at #loc201))
797
+ #loc212 = loc(callsite(#loc33 at #loc203))
798
+ #loc213 = loc(callsite(#loc33 at #loc206))
799
+ #loc214 = loc(callsite(#loc33 at #loc209))
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/__grp__triton_red_fused_argmax_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json"}}
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin ADDED
Binary file (33.6 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "dce51101d74002d90b1fe021a32b8040ce453636ab0b0ab209427e2e72d803f0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"}
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
9
+ %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %8 = shl i32 %7, 3, !dbg !8
11
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %10 = and i32 %9, 384, !dbg !9
13
+ %11 = lshr exact i32 %10, 7, !dbg !9
14
+ %12 = or disjoint i32 %11, 4, !dbg !9
15
+ %13 = or disjoint i32 %11, %8, !dbg !10
16
+ %14 = or disjoint i32 %12, %8, !dbg !10
17
+ %15 = shl nuw nsw i32 %9, 2, !dbg !11
18
+ %16 = and i32 %15, 508, !dbg !11
19
+ %17 = sdiv i32 %13, 2048, !dbg !12
20
+ %18 = sdiv i32 %14, 2048, !dbg !12
21
+ %19 = mul i32 %13, 32000
22
+ %20 = mul i32 %17, 224000
23
+ %21 = add i32 %20, %19
24
+ %22 = mul i32 %14, 32000
25
+ %23 = mul i32 %18, 224000
26
+ %24 = add i32 %23, %22
27
+ %25 = zext nneg i32 %16 to i64, !dbg !13
28
+ br label %26, !dbg !13
29
+
30
+ 26: ; preds = %6, %26
31
+ %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %26 ]
32
+ %27 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %126, %26 ]
33
+ %28 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %131, %26 ]
34
+ %29 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %127, %26 ]
35
+ %30 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %132, %26 ]
36
+ %31 = phi <4 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %130, %26 ]
37
+ %32 = phi <4 x i32> [ splat (i32 2147483647), %6 ], [ %133, %26 ]
38
+ %33 = or disjoint i64 %indvars.iv, %25, !dbg !14
39
+ %34 = icmp samesign ult i64 %33, 32000, !dbg !15
40
+ %35 = trunc nuw nsw i64 %33 to i32, !dbg !16
41
+ %36 = add i32 %21, %35, !dbg !16
42
+ %37 = add i32 %24, %35, !dbg !16
43
+ %38 = sext i32 %36 to i64, !dbg !17
44
+ %39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !17
45
+ %40 = sext i32 %37 to i64, !dbg !17
46
+ %41 = getelementptr float, ptr addrspace(1) %0, i64 %40, !dbg !17
47
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18
48
+ %43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i64 %42, i1 %34) #4, !dbg !18
49
+ %44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18
50
+ %45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18
51
+ %46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18
52
+ %47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18
53
+ %48 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18
54
+ %49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %41, i64 %48, i1 %34) #4, !dbg !18
55
+ %50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !18
56
+ %51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !18
57
+ %52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !18
58
+ %53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !18
59
+ %54 = fcmp uno <2 x float> %27, zeroinitializer, !dbg !19
60
+ %55 = fcmp uno <4 x float> %31, zeroinitializer, !dbg !19
61
+ %56 = fcmp uno <2 x float> %29, zeroinitializer, !dbg !19
62
+ %57 = sext <2 x i32> %28 to <2 x i64>, !dbg !23
63
+ %58 = sext <2 x i32> %30 to <2 x i64>, !dbg !23
64
+ %59 = trunc nuw nsw i64 %33 to i32, !dbg !24
65
+ %60 = or disjoint i32 %59, 1, !dbg !24
66
+ %61 = insertelement <2 x i32> poison, i32 %44, i64 0, !dbg !18
67
+ %62 = insertelement <2 x i32> %61, i32 %45, i64 1, !dbg !18
68
+ %63 = bitcast <2 x i32> %62 to <2 x float>, !dbg !18
69
+ %64 = fcmp ogt <2 x float> %27, %63, !dbg !25
70
+ %65 = fcmp oeq <2 x float> %27, %63, !dbg !26
71
+ %66 = fcmp uno <2 x float> %63, zeroinitializer, !dbg !27
72
+ %67 = xor <2 x i1> %66, splat (i1 true), !dbg !28
73
+ %68 = and <2 x i1> %54, %67, !dbg !29
74
+ %69 = or <2 x i1> %64, %68, !dbg !30
75
+ %70 = and <2 x i1> %54, %66, !dbg !31
76
+ %71 = or <2 x i1> %65, %70, !dbg !32
77
+ %72 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !23
78
+ %73 = shufflevector <2 x i64> %72, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !23
79
+ %74 = icmp sgt <2 x i64> %73, %57, !dbg !23
80
+ %75 = icmp sge <2 x i64> %73, %57, !dbg !23
81
+ %76 = shufflevector <2 x i1> %74, <2 x i1> %75, <2 x i32> <i32 0, i32 3>, !dbg !23
82
+ %77 = and <2 x i1> %76, %71, !dbg !33
83
+ %78 = or <2 x i1> %69, %77, !dbg !34
84
+ %79 = select <2 x i1> %78, <2 x float> %27, <2 x float> %63, !dbg !35
85
+ %80 = insertelement <2 x i32> poison, i32 %35, i64 0, !dbg !24
86
+ %81 = insertelement <2 x i32> %80, i32 %60, i64 1, !dbg !24
87
+ %82 = select <2 x i1> %78, <2 x i32> %28, <2 x i32> %81, !dbg !24
88
+ %83 = insertelement <2 x i32> poison, i32 %50, i64 0, !dbg !18
89
+ %84 = insertelement <2 x i32> %83, i32 %51, i64 1, !dbg !18
90
+ %85 = bitcast <2 x i32> %84 to <2 x float>, !dbg !18
91
+ %86 = fcmp ogt <2 x float> %29, %85, !dbg !25
92
+ %87 = fcmp oeq <2 x float> %29, %85, !dbg !26
93
+ %88 = fcmp uno <2 x float> %85, zeroinitializer, !dbg !27
94
+ %89 = xor <2 x i1> %88, splat (i1 true), !dbg !28
95
+ %90 = and <2 x i1> %56, %89, !dbg !29
96
+ %91 = or <2 x i1> %86, %90, !dbg !30
97
+ %92 = and <2 x i1> %56, %88, !dbg !31
98
+ %93 = or <2 x i1> %87, %92, !dbg !32
99
+ %94 = icmp sgt <2 x i64> %73, %58, !dbg !23
100
+ %95 = icmp sge <2 x i64> %73, %58, !dbg !23
101
+ %96 = shufflevector <2 x i1> %94, <2 x i1> %95, <2 x i32> <i32 0, i32 3>, !dbg !23
102
+ %97 = and <2 x i1> %96, %93, !dbg !33
103
+ %98 = or <2 x i1> %91, %97, !dbg !34
104
+ %99 = select <2 x i1> %98, <2 x float> %29, <2 x float> %85, !dbg !35
105
+ %100 = select <2 x i1> %98, <2 x i32> %30, <2 x i32> %81, !dbg !24
106
+ %101 = or disjoint <2 x i64> %73, <i64 3, i64 2>, !dbg !14
107
+ %102 = shufflevector <2 x i64> %101, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>, !dbg !14
108
+ %103 = insertelement <4 x i32> poison, i32 %53, i64 0, !dbg !18
109
+ %104 = insertelement <4 x i32> %103, i32 %52, i64 1, !dbg !18
110
+ %105 = insertelement <4 x i32> %104, i32 %47, i64 2, !dbg !18
111
+ %106 = insertelement <4 x i32> %105, i32 %46, i64 3, !dbg !18
112
+ %107 = bitcast <4 x i32> %106 to <4 x float>, !dbg !18
113
+ %108 = fcmp ogt <4 x float> %31, %107, !dbg !25
114
+ %109 = fcmp oeq <4 x float> %31, %107, !dbg !26
115
+ %110 = fcmp uno <4 x float> %107, zeroinitializer, !dbg !27
116
+ %111 = xor <4 x i1> %110, splat (i1 true), !dbg !28
117
+ %112 = and <4 x i1> %55, %111, !dbg !29
118
+ %113 = or <4 x i1> %108, %112, !dbg !30
119
+ %114 = and <4 x i1> %55, %110, !dbg !31
120
+ %115 = or <4 x i1> %109, %114, !dbg !32
121
+ %116 = sext <4 x i32> %32 to <4 x i64>, !dbg !23
122
+ %117 = icmp sgt <4 x i64> %102, %116, !dbg !23
123
+ %118 = and <4 x i1> %117, %115, !dbg !33
124
+ %119 = or <4 x i1> %113, %118, !dbg !34
125
+ %120 = select <4 x i1> %119, <4 x float> %31, <4 x float> %107, !dbg !35
126
+ %121 = trunc <2 x i64> %101 to <2 x i32>, !dbg !24
127
+ %122 = shufflevector <2 x i32> %121, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>, !dbg !24
128
+ %123 = select <4 x i1> %119, <4 x i32> %32, <4 x i32> %122, !dbg !24
129
+ %124 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !36
130
+ %125 = shufflevector <2 x i1> %124, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !36
131
+ %126 = select <2 x i1> %125, <2 x float> %79, <2 x float> %27, !dbg !36
132
+ %127 = select <2 x i1> %125, <2 x float> %99, <2 x float> %29, !dbg !36
133
+ %128 = insertelement <4 x i1> poison, i1 %34, i64 0, !dbg !36
134
+ %129 = shufflevector <4 x i1> %128, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !36
135
+ %130 = select <4 x i1> %129, <4 x float> %120, <4 x float> %31, !dbg !36
136
+ %131 = select <2 x i1> %125, <2 x i32> %82, <2 x i32> %28, !dbg !37
137
+ %132 = select <2 x i1> %125, <2 x i32> %100, <2 x i32> %30, !dbg !37
138
+ %133 = select <4 x i1> %129, <4 x i32> %123, <4 x i32> %32, !dbg !37
139
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 512, !dbg !13
140
+ %134 = icmp samesign ult i64 %indvars.iv, 31488, !dbg !13
141
+ br i1 %134, label %26, label %135, !dbg !13
142
+
143
+ 135: ; preds = %26
144
+ %136 = and i32 %9, 7, !dbg !9
145
+ %137 = or disjoint i32 %8, %136, !dbg !10
146
+ %138 = and i32 %9, 31, !dbg !9
147
+ %139 = lshr i32 %9, 5, !dbg !9
148
+ %140 = shufflevector <2 x float> %126, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !38
149
+ %141 = fcmp ogt <2 x float> %126, %140, !dbg !38
150
+ %142 = fcmp oeq <2 x float> %126, %140, !dbg !38
151
+ %143 = shufflevector <2 x i1> %141, <2 x i1> %142, <2 x i32> <i32 0, i32 3>, !dbg !38
152
+ %144 = extractelement <2 x float> %126, i64 0, !dbg !40
153
+ %145 = fcmp uno float %144, 0.000000e+00, !dbg !40
154
+ %146 = extractelement <2 x float> %126, i64 1, !dbg !41
155
+ %147 = fcmp uno float %146, 0.000000e+00, !dbg !41
156
+ %148 = xor i1 %147, true, !dbg !42
157
+ %149 = insertelement <2 x i1> poison, i1 %145, i64 0, !dbg !43
158
+ %150 = shufflevector <2 x i1> %149, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43
159
+ %151 = insertelement <2 x i1> poison, i1 %148, i64 0, !dbg !43
160
+ %152 = insertelement <2 x i1> %151, i1 %147, i64 1, !dbg !43
161
+ %153 = and <2 x i1> %150, %152, !dbg !43
162
+ %154 = or <2 x i1> %143, %153, !dbg !44
163
+ %155 = extractelement <2 x i32> %131, i64 0, !dbg !45
164
+ %156 = extractelement <2 x i32> %131, i64 1, !dbg !45
165
+ %157 = icmp slt i32 %155, %156, !dbg !45
166
+ %158 = extractelement <2 x i1> %154, i64 1, !dbg !46
167
+ %159 = and i1 %157, %158, !dbg !46
168
+ %160 = extractelement <2 x i1> %154, i64 0, !dbg !47
169
+ %161 = or i1 %160, %159, !dbg !47
170
+ %162 = select i1 %161, float %144, float %146, !dbg !48
171
+ %163 = select i1 %161, i32 %155, i32 %156, !dbg !49
172
+ %164 = extractelement <4 x float> %130, i64 3, !dbg !38
173
+ %165 = fcmp ogt float %162, %164, !dbg !38
174
+ %166 = fcmp oeq float %162, %164, !dbg !50
175
+ %167 = fcmp uno float %162, 0.000000e+00, !dbg !40
176
+ %168 = fcmp uno <4 x float> %130, zeroinitializer, !dbg !41
177
+ %169 = extractelement <4 x i1> %168, i64 3, !dbg !51
178
+ %170 = xor i1 %169, true, !dbg !42
179
+ %171 = and i1 %167, %170, !dbg !43
180
+ %172 = or i1 %165, %171, !dbg !44
181
+ %173 = and i1 %169, %167, !dbg !51
182
+ %174 = or i1 %166, %173, !dbg !52
183
+ %175 = extractelement <4 x i32> %133, i64 3, !dbg !45
184
+ %176 = icmp slt i32 %163, %175, !dbg !45
185
+ %177 = and i1 %176, %174, !dbg !46
186
+ %178 = or i1 %172, %177, !dbg !47
187
+ %179 = select i1 %178, float %162, float %164, !dbg !48
188
+ %180 = select i1 %178, i32 %163, i32 %175, !dbg !49
189
+ %181 = extractelement <4 x float> %130, i64 2, !dbg !38
190
+ %182 = fcmp ogt float %179, %181, !dbg !38
191
+ %183 = fcmp oeq float %179, %181, !dbg !50
192
+ %184 = fcmp uno float %179, 0.000000e+00, !dbg !40
193
+ %185 = extractelement <4 x i1> %168, i64 2, !dbg !51
194
+ %186 = xor i1 %185, true, !dbg !42
195
+ %187 = and i1 %184, %186, !dbg !43
196
+ %188 = or i1 %182, %187, !dbg !44
197
+ %189 = and i1 %185, %184, !dbg !51
198
+ %190 = or i1 %183, %189, !dbg !52
199
+ %191 = extractelement <4 x i32> %133, i64 2, !dbg !45
200
+ %192 = icmp slt i32 %180, %191, !dbg !45
201
+ %193 = and i1 %192, %190, !dbg !46
202
+ %194 = or i1 %188, %193, !dbg !47
203
+ %195 = select i1 %194, float %179, float %181, !dbg !48
204
+ %196 = select i1 %194, i32 %180, i32 %191, !dbg !49
205
+ %197 = shufflevector <2 x float> %127, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !38
206
+ %198 = fcmp ogt <2 x float> %127, %197, !dbg !38
207
+ %199 = fcmp oeq <2 x float> %127, %197, !dbg !38
208
+ %200 = shufflevector <2 x i1> %198, <2 x i1> %199, <2 x i32> <i32 0, i32 3>, !dbg !38
209
+ %201 = extractelement <2 x float> %127, i64 0, !dbg !40
210
+ %202 = fcmp uno float %201, 0.000000e+00, !dbg !40
211
+ %203 = extractelement <2 x float> %127, i64 1, !dbg !41
212
+ %204 = fcmp uno float %203, 0.000000e+00, !dbg !41
213
+ %205 = xor i1 %204, true, !dbg !42
214
+ %206 = insertelement <2 x i1> poison, i1 %202, i64 0, !dbg !43
215
+ %207 = shufflevector <2 x i1> %206, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43
216
+ %208 = insertelement <2 x i1> poison, i1 %205, i64 0, !dbg !43
217
+ %209 = insertelement <2 x i1> %208, i1 %204, i64 1, !dbg !43
218
+ %210 = and <2 x i1> %207, %209, !dbg !43
219
+ %211 = or <2 x i1> %200, %210, !dbg !44
220
+ %212 = extractelement <2 x i32> %132, i64 0, !dbg !45
221
+ %213 = extractelement <2 x i32> %132, i64 1, !dbg !45
222
+ %214 = icmp slt i32 %212, %213, !dbg !45
223
+ %215 = extractelement <2 x i1> %211, i64 1, !dbg !46
224
+ %216 = and i1 %214, %215, !dbg !46
225
+ %217 = extractelement <2 x i1> %211, i64 0, !dbg !47
226
+ %218 = or i1 %217, %216, !dbg !47
227
+ %219 = select i1 %218, float %201, float %203, !dbg !48
228
+ %220 = select i1 %218, i32 %212, i32 %213, !dbg !49
229
+ %221 = extractelement <4 x float> %130, i64 1, !dbg !38
230
+ %222 = fcmp ogt float %219, %221, !dbg !38
231
+ %223 = fcmp oeq float %219, %221, !dbg !50
232
+ %224 = fcmp uno float %219, 0.000000e+00, !dbg !40
233
+ %225 = extractelement <4 x i1> %168, i64 1, !dbg !51
234
+ %226 = xor i1 %225, true, !dbg !42
235
+ %227 = and i1 %224, %226, !dbg !43
236
+ %228 = or i1 %222, %227, !dbg !44
237
+ %229 = and i1 %225, %224, !dbg !51
238
+ %230 = or i1 %223, %229, !dbg !52
239
+ %231 = extractelement <4 x i32> %133, i64 1, !dbg !45
240
+ %232 = icmp slt i32 %220, %231, !dbg !45
241
+ %233 = and i1 %232, %230, !dbg !46
242
+ %234 = or i1 %228, %233, !dbg !47
243
+ %235 = select i1 %234, float %219, float %221, !dbg !48
244
+ %236 = select i1 %234, i32 %220, i32 %231, !dbg !49
245
+ %237 = extractelement <4 x float> %130, i64 0, !dbg !38
246
+ %238 = fcmp ogt float %235, %237, !dbg !38
247
+ %239 = fcmp oeq float %235, %237, !dbg !50
248
+ %240 = fcmp uno float %235, 0.000000e+00, !dbg !40
249
+ %241 = extractelement <4 x i1> %168, i64 0, !dbg !51
250
+ %242 = xor i1 %241, true, !dbg !42
251
+ %243 = and i1 %240, %242, !dbg !43
252
+ %244 = or i1 %238, %243, !dbg !44
253
+ %245 = and i1 %241, %240, !dbg !51
254
+ %246 = or i1 %239, %245, !dbg !52
255
+ %247 = extractelement <4 x i32> %133, i64 0, !dbg !45
256
+ %248 = icmp slt i32 %236, %247, !dbg !45
257
+ %249 = and i1 %248, %246, !dbg !46
258
+ %250 = or i1 %244, %249, !dbg !47
259
+ %251 = select i1 %250, float %235, float %237, !dbg !48
260
+ %252 = select i1 %250, i32 %236, i32 %247, !dbg !49
261
+ %253 = bitcast float %195 to i32, !dbg !53
262
+ %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 16, i32 31), !dbg !53
263
+ %255 = bitcast i32 %254 to float, !dbg !53
264
+ %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 16, i32 31), !dbg !53
265
+ %257 = fcmp ogt float %195, %255, !dbg !38
266
+ %258 = fcmp oeq float %195, %255, !dbg !50
267
+ %259 = fcmp uno float %195, 0.000000e+00, !dbg !40
268
+ %260 = fcmp uno float %255, 0.000000e+00, !dbg !41
269
+ %261 = xor i1 %260, true, !dbg !42
270
+ %262 = and i1 %259, %261, !dbg !43
271
+ %263 = or i1 %257, %262, !dbg !44
272
+ %264 = and i1 %259, %260, !dbg !51
273
+ %265 = or i1 %258, %264, !dbg !52
274
+ %266 = icmp slt i32 %196, %256, !dbg !45
275
+ %267 = and i1 %266, %265, !dbg !46
276
+ %268 = or i1 %263, %267, !dbg !47
277
+ %269 = select i1 %268, float %195, float %255, !dbg !48
278
+ %270 = select i1 %268, i32 %196, i32 %256, !dbg !49
279
+ %271 = bitcast float %269 to i32, !dbg !53
280
+ %272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 8, i32 31), !dbg !53
281
+ %273 = bitcast i32 %272 to float, !dbg !53
282
+ %274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %270, i32 8, i32 31), !dbg !53
283
+ %275 = fcmp ogt float %269, %273, !dbg !38
284
+ %276 = fcmp oeq float %269, %273, !dbg !50
285
+ %277 = fcmp uno float %269, 0.000000e+00, !dbg !40
286
+ %278 = fcmp uno float %273, 0.000000e+00, !dbg !41
287
+ %279 = xor i1 %278, true, !dbg !42
288
+ %280 = and i1 %277, %279, !dbg !43
289
+ %281 = or i1 %275, %280, !dbg !44
290
+ %282 = and i1 %278, %277, !dbg !51
291
+ %283 = or i1 %276, %282, !dbg !52
292
+ %284 = icmp slt i32 %270, %274, !dbg !45
293
+ %285 = and i1 %284, %283, !dbg !46
294
+ %286 = or i1 %281, %285, !dbg !47
295
+ %287 = select i1 %286, float %269, float %273, !dbg !48
296
+ %288 = select i1 %286, i32 %270, i32 %274, !dbg !49
297
+ %289 = bitcast float %287 to i32, !dbg !53
298
+ %290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 4, i32 31), !dbg !53
299
+ %291 = bitcast i32 %290 to float, !dbg !53
300
+ %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 4, i32 31), !dbg !53
301
+ %293 = fcmp ogt float %287, %291, !dbg !38
302
+ %294 = fcmp oeq float %287, %291, !dbg !50
303
+ %295 = fcmp uno float %287, 0.000000e+00, !dbg !40
304
+ %296 = fcmp uno float %291, 0.000000e+00, !dbg !41
305
+ %297 = xor i1 %296, true, !dbg !42
306
+ %298 = and i1 %295, %297, !dbg !43
307
+ %299 = or i1 %293, %298, !dbg !44
308
+ %300 = and i1 %296, %295, !dbg !51
309
+ %301 = or i1 %294, %300, !dbg !52
310
+ %302 = icmp slt i32 %288, %292, !dbg !45
311
+ %303 = and i1 %302, %301, !dbg !46
312
+ %304 = or i1 %299, %303, !dbg !47
313
+ %305 = select i1 %304, float %287, float %291, !dbg !48
314
+ %306 = select i1 %304, i32 %288, i32 %292, !dbg !49
315
+ %307 = bitcast float %305 to i32, !dbg !53
316
+ %308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 2, i32 31), !dbg !53
317
+ %309 = bitcast i32 %308 to float, !dbg !53
318
+ %310 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !53
319
+ %311 = fcmp ogt float %305, %309, !dbg !38
320
+ %312 = fcmp oeq float %305, %309, !dbg !50
321
+ %313 = fcmp uno float %305, 0.000000e+00, !dbg !40
322
+ %314 = fcmp uno float %309, 0.000000e+00, !dbg !41
323
+ %315 = xor i1 %314, true, !dbg !42
324
+ %316 = and i1 %313, %315, !dbg !43
325
+ %317 = or i1 %311, %316, !dbg !44
326
+ %318 = and i1 %314, %313, !dbg !51
327
+ %319 = or i1 %312, %318, !dbg !52
328
+ %320 = icmp slt i32 %306, %310, !dbg !45
329
+ %321 = and i1 %320, %319, !dbg !46
330
+ %322 = or i1 %317, %321, !dbg !47
331
+ %323 = select i1 %322, float %305, float %309, !dbg !48
332
+ %324 = select i1 %322, i32 %306, i32 %310, !dbg !49
333
+ %325 = bitcast float %323 to i32, !dbg !53
334
+ %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 1, i32 31), !dbg !53
335
+ %327 = bitcast i32 %326 to float, !dbg !53
336
+ %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 1, i32 31), !dbg !53
337
+ %329 = fcmp ogt float %323, %327, !dbg !38
338
+ %330 = fcmp oeq float %323, %327, !dbg !50
339
+ %331 = fcmp uno float %323, 0.000000e+00, !dbg !40
340
+ %332 = fcmp uno float %327, 0.000000e+00, !dbg !41
341
+ %333 = xor i1 %332, true, !dbg !42
342
+ %334 = and i1 %331, %333, !dbg !43
343
+ %335 = or i1 %329, %334, !dbg !44
344
+ %336 = and i1 %332, %331, !dbg !51
345
+ %337 = or i1 %330, %336, !dbg !52
346
+ %338 = icmp slt i32 %324, %328, !dbg !45
347
+ %339 = and i1 %338, %337, !dbg !46
348
+ %340 = or i1 %335, %339, !dbg !47
349
+ %341 = select i1 %340, i32 %324, i32 %328, !dbg !49
350
+ %342 = bitcast float %251 to i32, !dbg !53
351
+ %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 16, i32 31), !dbg !53
352
+ %344 = bitcast i32 %343 to float, !dbg !53
353
+ %345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 16, i32 31), !dbg !53
354
+ %346 = fcmp ogt float %251, %344, !dbg !38
355
+ %347 = fcmp oeq float %251, %344, !dbg !50
356
+ %348 = fcmp uno float %251, 0.000000e+00, !dbg !40
357
+ %349 = fcmp uno float %344, 0.000000e+00, !dbg !41
358
+ %350 = xor i1 %349, true, !dbg !42
359
+ %351 = and i1 %348, %350, !dbg !43
360
+ %352 = or i1 %346, %351, !dbg !44
361
+ %353 = and i1 %348, %349, !dbg !51
362
+ %354 = or i1 %347, %353, !dbg !52
363
+ %355 = icmp slt i32 %252, %345, !dbg !45
364
+ %356 = and i1 %355, %354, !dbg !46
365
+ %357 = or i1 %352, %356, !dbg !47
366
+ %358 = select i1 %357, float %251, float %344, !dbg !48
367
+ %359 = select i1 %357, i32 %252, i32 %345, !dbg !49
368
+ %360 = bitcast float %358 to i32, !dbg !53
369
+ %361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 8, i32 31), !dbg !53
370
+ %362 = bitcast i32 %361 to float, !dbg !53
371
+ %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 8, i32 31), !dbg !53
372
+ %364 = fcmp ogt float %358, %362, !dbg !38
373
+ %365 = fcmp oeq float %358, %362, !dbg !50
374
+ %366 = fcmp uno float %358, 0.000000e+00, !dbg !40
375
+ %367 = fcmp uno float %362, 0.000000e+00, !dbg !41
376
+ %368 = xor i1 %367, true, !dbg !42
377
+ %369 = and i1 %366, %368, !dbg !43
378
+ %370 = or i1 %364, %369, !dbg !44
379
+ %371 = and i1 %367, %366, !dbg !51
380
+ %372 = or i1 %365, %371, !dbg !52
381
+ %373 = icmp slt i32 %359, %363, !dbg !45
382
+ %374 = and i1 %373, %372, !dbg !46
383
+ %375 = or i1 %370, %374, !dbg !47
384
+ %376 = select i1 %375, float %358, float %362, !dbg !48
385
+ %377 = select i1 %375, i32 %359, i32 %363, !dbg !49
386
+ %378 = bitcast float %376 to i32, !dbg !53
387
+ %379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 4, i32 31), !dbg !53
388
+ %380 = bitcast i32 %379 to float, !dbg !53
389
+ %381 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %377, i32 4, i32 31), !dbg !53
390
+ %382 = fcmp ogt float %376, %380, !dbg !38
391
+ %383 = fcmp oeq float %376, %380, !dbg !50
392
+ %384 = fcmp uno float %376, 0.000000e+00, !dbg !40
393
+ %385 = fcmp uno float %380, 0.000000e+00, !dbg !41
394
+ %386 = xor i1 %385, true, !dbg !42
395
+ %387 = and i1 %384, %386, !dbg !43
396
+ %388 = or i1 %382, %387, !dbg !44
397
+ %389 = and i1 %385, %384, !dbg !51
398
+ %390 = or i1 %383, %389, !dbg !52
399
+ %391 = icmp slt i32 %377, %381, !dbg !45
400
+ %392 = and i1 %391, %390, !dbg !46
401
+ %393 = or i1 %388, %392, !dbg !47
402
+ %394 = select i1 %393, float %376, float %380, !dbg !48
403
+ %395 = select i1 %393, i32 %377, i32 %381, !dbg !49
404
+ %396 = bitcast float %394 to i32, !dbg !53
405
+ %397 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %396, i32 2, i32 31), !dbg !53
406
+ %398 = bitcast i32 %397 to float, !dbg !53
407
+ %399 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %395, i32 2, i32 31), !dbg !53
408
+ %400 = fcmp ogt float %394, %398, !dbg !38
409
+ %401 = fcmp oeq float %394, %398, !dbg !50
410
+ %402 = fcmp uno float %394, 0.000000e+00, !dbg !40
411
+ %403 = fcmp uno float %398, 0.000000e+00, !dbg !41
412
+ %404 = xor i1 %403, true, !dbg !42
413
+ %405 = and i1 %402, %404, !dbg !43
414
+ %406 = or i1 %400, %405, !dbg !44
415
+ %407 = and i1 %403, %402, !dbg !51
416
+ %408 = or i1 %401, %407, !dbg !52
417
+ %409 = icmp slt i32 %395, %399, !dbg !45
418
+ %410 = and i1 %409, %408, !dbg !46
419
+ %411 = or i1 %406, %410, !dbg !47
420
+ %412 = select i1 %411, float %394, float %398, !dbg !48
421
+ %413 = select i1 %411, i32 %395, i32 %399, !dbg !49
422
+ %414 = bitcast float %412 to i32, !dbg !53
423
+ %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !53
424
+ %416 = bitcast i32 %415 to float, !dbg !53
425
+ %417 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %413, i32 1, i32 31), !dbg !53
426
+ %418 = fcmp ogt float %412, %416, !dbg !38
427
+ %419 = fcmp oeq float %412, %416, !dbg !50
428
+ %420 = fcmp uno float %412, 0.000000e+00, !dbg !40
429
+ %421 = fcmp uno float %416, 0.000000e+00, !dbg !41
430
+ %422 = xor i1 %421, true, !dbg !42
431
+ %423 = and i1 %420, %422, !dbg !43
432
+ %424 = or i1 %418, %423, !dbg !44
433
+ %425 = and i1 %421, %420, !dbg !51
434
+ %426 = or i1 %419, %425, !dbg !52
435
+ %427 = icmp slt i32 %413, %417, !dbg !45
436
+ %428 = and i1 %427, %426, !dbg !46
437
+ %429 = or i1 %424, %428, !dbg !47
438
+ %430 = select i1 %429, i32 %413, i32 %417, !dbg !49
439
+ %431 = and i32 %139, 3, !dbg !53
440
+ %432 = icmp eq i32 %138, 0, !dbg !53
441
+ %433 = lshr exact i32 %10, 5, !dbg !53
442
+ %434 = or disjoint i32 %433, %431, !dbg !53
443
+ %435 = getelementptr float, ptr addrspace(3) @global_smem, i32 %434, !dbg !53
444
+ %436 = select i1 %340, i32 %325, i32 %326, !dbg !48
445
+ %437 = insertelement <1 x i32> poison, i32 %436, i64 0, !dbg !53
446
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %435, <1 x i32> %437, i1 %432) #4, !dbg !53
447
+ %438 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %434, !dbg !53
448
+ %439 = insertelement <1 x i32> poison, i32 %341, i64 0, !dbg !53
449
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %438, <1 x i32> %439, i1 %432) #4, !dbg !53
450
+ %440 = shl nuw nsw i32 %12, 2, !dbg !53
451
+ %441 = or disjoint i32 %440, %431, !dbg !53
452
+ %442 = getelementptr float, ptr addrspace(3) @global_smem, i32 %441, !dbg !53
453
+ %443 = select i1 %429, i32 %414, i32 %415, !dbg !48
454
+ %444 = insertelement <1 x i32> poison, i32 %443, i64 0, !dbg !53
455
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %442, <1 x i32> %444, i1 %432) #4, !dbg !53
456
+ %445 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %441, !dbg !53
457
+ %446 = insertelement <1 x i32> poison, i32 %430, i64 0, !dbg !53
458
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %445, <1 x i32> %446, i1 %432) #4, !dbg !53
459
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
460
+ %447 = icmp samesign ult i32 %9, 32, !dbg !53
461
+ %448 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !53
462
+ %449 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %448, i1 %447) #4, !dbg !53
463
+ %450 = bitcast i32 %449 to float, !dbg !53
464
+ %451 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %9, !dbg !53
465
+ %452 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %451, i1 %447) #4, !dbg !53
466
+ %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 2, i32 31), !dbg !53
467
+ %454 = bitcast i32 %453 to float, !dbg !53
468
+ %455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 2, i32 31), !dbg !53
469
+ %456 = fcmp ogt float %450, %454, !dbg !38
470
+ %457 = fcmp oeq float %450, %454, !dbg !50
471
+ %458 = fcmp uno float %450, 0.000000e+00, !dbg !40
472
+ %459 = fcmp uno float %454, 0.000000e+00, !dbg !41
473
+ %460 = xor i1 %459, true, !dbg !42
474
+ %461 = and i1 %458, %460, !dbg !43
475
+ %462 = or i1 %456, %461, !dbg !44
476
+ %463 = and i1 %458, %459, !dbg !51
477
+ %464 = or i1 %457, %463, !dbg !52
478
+ %465 = icmp slt i32 %452, %455, !dbg !45
479
+ %466 = and i1 %465, %464, !dbg !46
480
+ %467 = or i1 %462, %466, !dbg !47
481
+ %468 = select i1 %467, float %450, float %454, !dbg !48
482
+ %469 = select i1 %467, i32 %452, i32 %455, !dbg !49
483
+ %470 = bitcast float %468 to i32, !dbg !53
484
+ %471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 1, i32 31), !dbg !53
485
+ %472 = bitcast i32 %471 to float, !dbg !53
486
+ %473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !53
487
+ %474 = fcmp ogt float %468, %472, !dbg !38
488
+ %475 = fcmp oeq float %468, %472, !dbg !50
489
+ %476 = fcmp uno float %468, 0.000000e+00, !dbg !40
490
+ %477 = fcmp uno float %472, 0.000000e+00, !dbg !41
491
+ %478 = xor i1 %477, true, !dbg !42
492
+ %479 = and i1 %476, %478, !dbg !43
493
+ %480 = or i1 %474, %479, !dbg !44
494
+ %481 = and i1 %477, %476, !dbg !51
495
+ %482 = or i1 %475, %481, !dbg !52
496
+ %483 = icmp slt i32 %469, %473, !dbg !45
497
+ %484 = and i1 %483, %482, !dbg !46
498
+ %485 = or i1 %480, %484, !dbg !47
499
+ %486 = select i1 %485, i32 %469, i32 %473, !dbg !49
500
+ %487 = and i32 %9, 995, !dbg !53
501
+ %488 = icmp eq i32 %487, 0, !dbg !53
502
+ %489 = select i1 %485, i32 %470, i32 %471, !dbg !48
503
+ %490 = insertelement <1 x i32> poison, i32 %489, i64 0, !dbg !53
504
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %448, <1 x i32> %490, i1 %488) #4, !dbg !53
505
+ %491 = insertelement <1 x i32> poison, i32 %486, i64 0, !dbg !53
506
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %451, <1 x i32> %491, i1 %488) #4, !dbg !53
507
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
508
+ %492 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %433, !dbg !53
509
+ %493 = load i32, ptr addrspace(3) %492, align 16, !dbg !53
510
+ %494 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %440, !dbg !53
511
+ %495 = load i32, ptr addrspace(3) %494, align 16, !dbg !53
512
+ %496 = sext i32 %137 to i64, !dbg !54
513
+ %497 = getelementptr i64, ptr addrspace(1) %1, i64 %496, !dbg !54
514
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55
515
+ %498 = lshr exact i32 %10, 4, !dbg !55
516
+ %499 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %498, !dbg !55
517
+ %500 = insertelement <2 x i32> poison, i32 %493, i64 0, !dbg !55
518
+ %501 = insertelement <2 x i32> %500, i32 %495, i64 1, !dbg !55
519
+ store <2 x i32> %501, ptr addrspace(3) %499, align 8, !dbg !55
520
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55
521
+ %502 = shl nuw nsw i32 %9, 3, !dbg !55
522
+ %503 = and i32 %502, 24, !dbg !55
523
+ %504 = and i32 %9, 4, !dbg !55
524
+ %505 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %503, !dbg !55
525
+ %506 = getelementptr inbounds nuw i8, ptr addrspace(3) %505, i32 %504, !dbg !55
526
+ %507 = load i32, ptr addrspace(3) %506, align 4, !dbg !55
527
+ %508 = sext i32 %507 to i64, !dbg !55
528
+ %509 = and i32 %9, 504, !dbg !55
529
+ %510 = icmp eq i32 %509, 0, !dbg !55
530
+ tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %508, ptr addrspace(1) %497, i1 %510) #4, !dbg !55
531
+ ret void, !dbg !56
532
+ }
533
+
534
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
535
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
536
+
537
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
538
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
539
+
540
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
541
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
542
+
543
+ ; Function Attrs: convergent nocallback nounwind
544
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
545
+
546
+ attributes #0 = { nounwind "nvvm.reqntid"="512" }
547
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
548
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
549
+ attributes #3 = { convergent nocallback nounwind }
550
+ attributes #4 = { nounwind }
551
+
552
+ !llvm.dbg.cu = !{!0}
553
+ !llvm.module.flags = !{!2, !3}
554
+
555
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
556
+ !1 = !DIFile(filename: "cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv")
557
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
558
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
559
+ !4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
560
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
561
+ !6 = !{}
562
+ !7 = !DILocation(line: 23, column: 28, scope: !4)
563
+ !8 = !DILocation(line: 23, column: 33, scope: !4)
564
+ !9 = !DILocation(line: 24, column: 44, scope: !4)
565
+ !10 = !DILocation(line: 24, column: 23, scope: !4)
566
+ !11 = !DILocation(line: 26, column: 37, scope: !4)
567
+ !12 = !DILocation(line: 29, column: 19, scope: !4)
568
+ !13 = !DILocation(line: 33, column: 40, scope: !4)
569
+ !14 = !DILocation(line: 34, column: 31, scope: !4)
570
+ !15 = !DILocation(line: 35, column: 29, scope: !4)
571
+ !16 = !DILocation(line: 39, column: 52, scope: !4)
572
+ !17 = !DILocation(line: 39, column: 34, scope: !4)
573
+ !18 = !DILocation(line: 39, column: 66, scope: !4)
574
+ !19 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !22)
575
+ !20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0)
576
+ !21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
577
+ !22 = !DILocation(line: 42, column: 38, scope: !4)
578
+ !23 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !22)
579
+ !24 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !22)
580
+ !25 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !22)
581
+ !26 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !22)
582
+ !27 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !22)
583
+ !28 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !22)
584
+ !29 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !22)
585
+ !30 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !22)
586
+ !31 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !22)
587
+ !32 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !22)
588
+ !33 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !22)
589
+ !34 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !22)
590
+ !35 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !22)
591
+ !36 = !DILocation(line: 44, column: 46, scope: !4)
592
+ !37 = !DILocation(line: 45, column: 58, scope: !4)
593
+ !38 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !39)
594
+ !39 = !DILocation(line: 46, column: 75, scope: !4)
595
+ !40 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !39)
596
+ !41 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !39)
597
+ !42 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !39)
598
+ !43 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !39)
599
+ !44 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !39)
600
+ !45 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !39)
601
+ !46 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !39)
602
+ !47 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !39)
603
+ !48 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !39)
604
+ !49 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !39)
605
+ !50 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !39)
606
+ !51 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !39)
607
+ !52 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !39)
608
+ !53 = !DILocation(line: 165, column: 42, scope: !20, inlinedAt: !39)
609
+ !54 = !DILocation(line: 48, column: 25, scope: !4)
610
+ !55 = !DILocation(line: 48, column: 36, scope: !4)
611
+ !56 = !DILocation(line: 48, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx ADDED
@@ -0,0 +1,1196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_red_fused_argmax_1
12
+ .visible .entry triton_red_fused_argmax_1(
13
+ .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1,
15
+ .param .u32 triton_red_fused_argmax_1_param_2,
16
+ .param .u32 triton_red_fused_argmax_1_param_3,
17
+ .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4,
18
+ .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5
19
+ )
20
+ .reqntid 512
21
+ {
22
+ .reg .pred %p<325>;
23
+ .reg .b32 %r<226>;
24
+ .reg .b64 %rd<63>;
25
+ .loc 1 18 0 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:18:0
26
+ $L__func_begin0:
27
+ .loc 1 18 0 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:18:0
28
+
29
+ // %bb.0:
30
+ ld.param.b64 %rd15, [triton_red_fused_argmax_1_param_1];
31
+ ld.param.b64 %rd14, [triton_red_fused_argmax_1_param_0];
32
+ $L__tmp0:
33
+ .loc 1 23 28 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:23:28
34
+ mov.u32 %r27, %ctaid.x;
35
+ .loc 1 23 33 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:23:33
36
+ shl.b32 %r1, %r27, 3;
37
+ .loc 1 24 44 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:44
38
+ mov.u32 %r2, %tid.x;
39
+ and.b32 %r3, %r2, 384;
40
+ bfe.u32 %r28, %r2, 7, 2;
41
+ or.b32 %r4, %r28, 4;
42
+ .loc 1 24 23 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:23
43
+ or.b32 %r29, %r28, %r1;
44
+ or.b32 %r30, %r4, %r1;
45
+ .loc 1 26 37 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:26:37
46
+ shl.b32 %r31, %r2, 2;
47
+ and.b32 %r32, %r31, 508;
48
+ .loc 1 29 19 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:29:19
49
+ bfe.s32 %r33, %r27, 28, 1;
50
+ shr.u32 %r34, %r33, 21;
51
+ add.s32 %r35, %r29, %r34;
52
+ shr.s32 %r36, %r35, 11;
53
+ add.s32 %r37, %r30, %r34;
54
+ shr.s32 %r38, %r37, 11;
55
+ .loc 1 33 40 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:33:40
56
+ cvt.u64.u32 %rd1, %r32;
57
+ mul.lo.s32 %r39, %r27, 256000;
58
+ mad.lo.s32 %r40, %r38, 224000, %r39;
59
+ mul.lo.s32 %r41, %r28, 32000;
60
+ add.s32 %r42, %r40, %r41;
61
+ add.s32 %r43, %r42, %r32;
62
+ cvt.u64.u32 %rd2, %r43;
63
+ mad.lo.s32 %r44, %r36, 224000, %r39;
64
+ add.s32 %r45, %r44, %r41;
65
+ add.s32 %r46, %r45, %r32;
66
+ cvt.u64.u32 %rd3, %r46;
67
+ mov.b32 %r47, 0fFF800000;
68
+ mov.b64 %rd59, {%r47, %r47};
69
+ mov.b32 %r218, 2147483647;
70
+ mov.b64 %rd58, -512;
71
+ mov.b32 %r219, %r218;
72
+ mov.b64 %rd60, %rd59;
73
+ mov.b32 %r220, %r218;
74
+ mov.b32 %r221, %r218;
75
+ mov.b64 %rd61, %rd59;
76
+ mov.b64 %rd62, %rd59;
77
+ mov.b32 %r222, %r218;
78
+ mov.b32 %r223, %r218;
79
+ mov.b32 %r224, %r218;
80
+ mov.b32 %r225, %r218;
81
+ $L__BB0_1: // =>This Inner Loop Header: Depth=1
82
+ .loc 1 35 29 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:35:29
83
+ add.s64 %rd26, %rd1, %rd58;
84
+ add.s64 %rd27, %rd26, 512;
85
+ setp.lt.u64 %p1, %rd27, 32000;
86
+ .loc 1 39 52 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:52
87
+ add.s64 %rd28, %rd3, %rd58;
88
+ .loc 1 39 34 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:34
89
+ add.s64 %rd29, %rd2, %rd58;
90
+ cvt.u32.u64 %r64, %rd28;
91
+ add.s32 %r65, %r64, 512;
92
+ mad.wide.s32 %rd21, %r65, 4, %rd14;
93
+ cvt.u32.u64 %r66, %rd29;
94
+ add.s32 %r67, %r66, 128512;
95
+ mad.wide.s32 %rd24, %r67, 4, %rd14;
96
+ .loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
97
+ // begin inline asm
98
+ mov.u64 %rd20, 0x0;
99
+ createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0;
100
+ // end inline asm
101
+ mov.b32 %r52, 0;
102
+ // begin inline asm
103
+ mov.u32 %r48, %r52;
104
+ mov.u32 %r49, %r52;
105
+ mov.u32 %r50, %r52;
106
+ mov.u32 %r51, %r52;
107
+ @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd21 + 0 ], %rd20;
108
+ // end inline asm
109
+ // begin inline asm
110
+ mov.u64 %rd23, 0x0;
111
+ createpolicy.fractional.L2::evict_first.b64 %rd23, 1.0;
112
+ // end inline asm
113
+ // begin inline asm
114
+ mov.u32 %r56, %r52;
115
+ mov.u32 %r57, %r52;
116
+ mov.u32 %r58, %r52;
117
+ mov.u32 %r59, %r52;
118
+ @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd24 + 0 ], %rd23;
119
+ // end inline asm
120
+ $L__tmp1:
121
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
122
+ mov.b64 {%r68, %r69}, %rd59;
123
+ setp.nan.f32 %p3, %r68, %r68;
124
+ setp.nan.f32 %p4, %r69, %r69;
125
+ mov.b64 {%r70, %r71}, %rd61;
126
+ setp.nan.f32 %p5, %r70, %r70;
127
+ setp.nan.f32 %p6, %r71, %r71;
128
+ mov.b64 {%r72, %r73}, %rd62;
129
+ setp.nan.f32 %p7, %r72, %r72;
130
+ setp.nan.f32 %p8, %r73, %r73;
131
+ mov.b64 {%r74, %r75}, %rd60;
132
+ setp.nan.f32 %p9, %r74, %r74;
133
+ setp.nan.f32 %p10, %r75, %r75;
134
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
135
+ cvt.s64.s32 %rd30, %r219;
136
+ cvt.s64.s32 %rd31, %r218;
137
+ cvt.s64.s32 %rd32, %r221;
138
+ cvt.s64.s32 %rd33, %r220;
139
+ $L__tmp2:
140
+ .loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
141
+ cvt.u64.u32 %rd34, %r48;
142
+ cvt.u64.u32 %rd35, %r49;
143
+ shl.b64 %rd36, %rd35, 32;
144
+ or.b64 %rd37, %rd34, %rd36;
145
+ $L__tmp3:
146
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
147
+ mov.b64 {%r76, %r77}, %rd37;
148
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
149
+ setp.gt.f32 %p11, %r69, %r77;
150
+ setp.gt.f32 %p12, %r68, %r76;
151
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
152
+ setp.eq.f32 %p13, %r68, %r76;
153
+ setp.eq.f32 %p14, %r69, %r77;
154
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
155
+ setp.nan.f32 %p15, %r77, %r77;
156
+ setp.nan.f32 %p16, %r76, %r76;
157
+ setp.num.f32 %p17, %r76, %r76;
158
+ setp.num.f32 %p18, %r77, %r77;
159
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
160
+ and.pred %p19, %p4, %p18;
161
+ and.pred %p20, %p3, %p17;
162
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
163
+ or.pred %p21, %p12, %p20;
164
+ or.pred %p22, %p11, %p19;
165
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
166
+ and.pred %p23, %p3, %p16;
167
+ and.pred %p24, %p4, %p15;
168
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
169
+ or.pred %p25, %p14, %p24;
170
+ or.pred %p26, %p13, %p23;
171
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
172
+ setp.gt.s64 %p27, %rd27, %rd31;
173
+ setp.ge.s64 %p28, %rd27, %rd30;
174
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
175
+ and.pred %p29, %p27, %p26;
176
+ and.pred %p30, %p28, %p25;
177
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
178
+ or.pred %p31, %p22, %p30;
179
+ or.pred %p32, %p21, %p29;
180
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
181
+ selp.f32 %r78, %r68, %r76, %p32;
182
+ selp.f32 %r79, %r69, %r77, %p31;
183
+ cvt.u32.u64 %r80, %rd27;
184
+ cvt.u32.u64 %r81, %rd26;
185
+ add.s32 %r82, %r81, 513;
186
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
187
+ selp.b32 %r83, %r218, %r80, %p32;
188
+ selp.b32 %r84, %r219, %r82, %p31;
189
+ $L__tmp4:
190
+ .loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
191
+ cvt.u64.u32 %rd38, %r56;
192
+ cvt.u64.u32 %rd39, %r57;
193
+ shl.b64 %rd40, %rd39, 32;
194
+ or.b64 %rd41, %rd38, %rd40;
195
+ $L__tmp5:
196
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
197
+ mov.b64 {%r85, %r86}, %rd41;
198
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
199
+ setp.gt.f32 %p33, %r75, %r86;
200
+ setp.gt.f32 %p34, %r74, %r85;
201
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
202
+ setp.eq.f32 %p35, %r74, %r85;
203
+ setp.eq.f32 %p36, %r75, %r86;
204
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
205
+ setp.nan.f32 %p37, %r86, %r86;
206
+ setp.nan.f32 %p38, %r85, %r85;
207
+ setp.num.f32 %p39, %r85, %r85;
208
+ setp.num.f32 %p40, %r86, %r86;
209
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
210
+ and.pred %p41, %p10, %p40;
211
+ and.pred %p42, %p9, %p39;
212
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
213
+ or.pred %p43, %p34, %p42;
214
+ or.pred %p44, %p33, %p41;
215
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
216
+ and.pred %p45, %p9, %p38;
217
+ and.pred %p46, %p10, %p37;
218
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
219
+ or.pred %p47, %p36, %p46;
220
+ or.pred %p48, %p35, %p45;
221
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
222
+ setp.gt.s64 %p49, %rd27, %rd33;
223
+ setp.ge.s64 %p50, %rd27, %rd32;
224
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
225
+ and.pred %p51, %p49, %p48;
226
+ and.pred %p52, %p50, %p47;
227
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
228
+ or.pred %p53, %p44, %p52;
229
+ or.pred %p54, %p43, %p51;
230
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
231
+ selp.f32 %r87, %r74, %r85, %p54;
232
+ selp.f32 %r88, %r75, %r86, %p53;
233
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
234
+ selp.b32 %r89, %r220, %r80, %p54;
235
+ selp.b32 %r90, %r221, %r82, %p53;
236
+ $L__tmp6:
237
+ .loc 1 34 31 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:34:31
238
+ add.s64 %rd42, %rd26, 515;
239
+ add.s64 %rd43, %rd26, 514;
240
+ .loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
241
+ cvt.u64.u32 %rd44, %r59;
242
+ cvt.u64.u32 %rd45, %r58;
243
+ shl.b64 %rd46, %rd45, 32;
244
+ or.b64 %rd47, %rd44, %rd46;
245
+ cvt.u64.u32 %rd48, %r51;
246
+ cvt.u64.u32 %rd49, %r50;
247
+ shl.b64 %rd50, %rd49, 32;
248
+ or.b64 %rd51, %rd48, %rd50;
249
+ $L__tmp7:
250
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
251
+ mov.b64 {%r91, %r92}, %rd51;
252
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
253
+ setp.gt.f32 %p55, %r73, %r92;
254
+ setp.gt.f32 %p56, %r72, %r91;
255
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
256
+ mov.b64 {%r93, %r94}, %rd47;
257
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
258
+ setp.gt.f32 %p57, %r71, %r94;
259
+ setp.gt.f32 %p58, %r70, %r93;
260
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
261
+ setp.eq.f32 %p59, %r70, %r93;
262
+ setp.eq.f32 %p60, %r71, %r94;
263
+ setp.eq.f32 %p61, %r72, %r91;
264
+ setp.eq.f32 %p62, %r73, %r92;
265
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
266
+ setp.nan.f32 %p63, %r92, %r92;
267
+ setp.nan.f32 %p64, %r91, %r91;
268
+ setp.nan.f32 %p65, %r94, %r94;
269
+ setp.nan.f32 %p66, %r93, %r93;
270
+ setp.num.f32 %p67, %r93, %r93;
271
+ setp.num.f32 %p68, %r94, %r94;
272
+ setp.num.f32 %p69, %r91, %r91;
273
+ setp.num.f32 %p70, %r92, %r92;
274
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
275
+ and.pred %p71, %p8, %p70;
276
+ and.pred %p72, %p7, %p69;
277
+ and.pred %p73, %p6, %p68;
278
+ and.pred %p74, %p5, %p67;
279
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
280
+ or.pred %p75, %p58, %p74;
281
+ or.pred %p76, %p57, %p73;
282
+ or.pred %p77, %p56, %p72;
283
+ or.pred %p78, %p55, %p71;
284
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
285
+ and.pred %p79, %p5, %p66;
286
+ and.pred %p80, %p6, %p65;
287
+ and.pred %p81, %p7, %p64;
288
+ and.pred %p82, %p8, %p63;
289
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
290
+ or.pred %p83, %p62, %p82;
291
+ or.pred %p84, %p61, %p81;
292
+ or.pred %p85, %p60, %p80;
293
+ or.pred %p86, %p59, %p79;
294
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
295
+ cvt.s64.s32 %rd52, %r222;
296
+ cvt.s64.s32 %rd53, %r223;
297
+ cvt.s64.s32 %rd54, %r224;
298
+ cvt.s64.s32 %rd55, %r225;
299
+ setp.gt.s64 %p87, %rd43, %rd55;
300
+ setp.gt.s64 %p88, %rd42, %rd54;
301
+ setp.gt.s64 %p89, %rd43, %rd53;
302
+ setp.gt.s64 %p90, %rd42, %rd52;
303
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
304
+ and.pred %p91, %p90, %p86;
305
+ and.pred %p92, %p89, %p85;
306
+ and.pred %p93, %p88, %p84;
307
+ and.pred %p94, %p87, %p83;
308
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
309
+ or.pred %p95, %p78, %p94;
310
+ or.pred %p96, %p77, %p93;
311
+ or.pred %p97, %p76, %p92;
312
+ or.pred %p98, %p75, %p91;
313
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
314
+ selp.f32 %r95, %r70, %r93, %p98;
315
+ selp.f32 %r96, %r71, %r94, %p97;
316
+ selp.f32 %r97, %r72, %r91, %p96;
317
+ selp.f32 %r98, %r73, %r92, %p95;
318
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
319
+ cvt.u32.u64 %r99, %rd43;
320
+ cvt.u32.u64 %r100, %rd42;
321
+ selp.b32 %r101, %r222, %r100, %p98;
322
+ selp.b32 %r102, %r223, %r99, %p97;
323
+ selp.b32 %r103, %r224, %r100, %p96;
324
+ selp.b32 %r104, %r225, %r99, %p95;
325
+ $L__tmp8:
326
+ .loc 1 44 46 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:44:46
327
+ selp.f32 %r105, %r79, %r69, %p1;
328
+ selp.f32 %r106, %r78, %r68, %p1;
329
+ mov.b64 %rd59, {%r106, %r105};
330
+ selp.f32 %r107, %r88, %r75, %p1;
331
+ selp.f32 %r108, %r87, %r74, %p1;
332
+ mov.b64 %rd60, {%r108, %r107};
333
+ selp.f32 %r109, %r98, %r73, %p1;
334
+ selp.f32 %r110, %r97, %r72, %p1;
335
+ mov.b64 %rd62, {%r110, %r109};
336
+ selp.f32 %r111, %r96, %r71, %p1;
337
+ selp.f32 %r112, %r95, %r70, %p1;
338
+ mov.b64 %rd61, {%r112, %r111};
339
+ .loc 1 45 58 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:45:58
340
+ selp.b32 %r219, %r84, %r219, %p1;
341
+ selp.b32 %r218, %r83, %r218, %p1;
342
+ selp.b32 %r221, %r90, %r221, %p1;
343
+ selp.b32 %r220, %r89, %r220, %p1;
344
+ selp.b32 %r225, %r104, %r225, %p1;
345
+ selp.b32 %r224, %r103, %r224, %p1;
346
+ selp.b32 %r223, %r102, %r223, %p1;
347
+ selp.b32 %r222, %r101, %r222, %p1;
348
+ .loc 1 33 40 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:33:40
349
+ add.s64 %rd58, %rd58, 512;
350
+ setp.lt.u64 %p99, %rd58, 31488;
351
+ @%p99 bra $L__BB0_1;
352
+ // %bb.2:
353
+ .loc 1 24 44 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:44
354
+ and.b32 %r129, %r2, 7;
355
+ .loc 1 24 23 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:23
356
+ or.b32 %r130, %r1, %r129;
357
+ .loc 1 24 44 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:44
358
+ and.b32 %r131, %r2, 31;
359
+ $L__tmp9:
360
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
361
+ mov.b64 {%r132, %r133}, %rd59;
362
+ setp.gt.f32 %p109, %r132, %r133;
363
+ setp.eq.f32 %p110, %r133, %r132;
364
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
365
+ setp.nan.f32 %p111, %r132, %r132;
366
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
367
+ setp.num.f32 %p112, %r133, %r133;
368
+ setp.nan.f32 %p113, %r133, %r133;
369
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
370
+ and.pred %p114, %p111, %p113;
371
+ and.pred %p115, %p111, %p112;
372
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
373
+ or.pred %p116, %p109, %p115;
374
+ or.pred %p117, %p110, %p114;
375
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
376
+ setp.lt.s32 %p118, %r218, %r219;
377
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
378
+ and.pred %p119, %p118, %p117;
379
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
380
+ or.pred %p120, %p116, %p119;
381
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
382
+ selp.f32 %r134, %r132, %r133, %p120;
383
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
384
+ selp.b32 %r135, %r218, %r219, %p120;
385
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
386
+ mov.b64 {%r136, %r137}, %rd62;
387
+ setp.gt.f32 %p121, %r134, %r137;
388
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
389
+ setp.eq.f32 %p122, %r134, %r137;
390
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
391
+ setp.nan.f32 %p123, %r134, %r134;
392
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
393
+ mov.b64 {%r138, %r139}, %rd61;
394
+ setp.nan.f32 %p124, %r138, %r138;
395
+ setp.num.f32 %p125, %r138, %r138;
396
+ setp.nan.f32 %p126, %r139, %r139;
397
+ setp.num.f32 %p127, %r139, %r139;
398
+ setp.nan.f32 %p128, %r136, %r136;
399
+ setp.num.f32 %p129, %r136, %r136;
400
+ setp.nan.f32 %p130, %r137, %r137;
401
+ setp.num.f32 %p131, %r137, %r137;
402
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
403
+ and.pred %p132, %p123, %p131;
404
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
405
+ or.pred %p133, %p121, %p132;
406
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
407
+ and.pred %p134, %p130, %p123;
408
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
409
+ or.pred %p135, %p122, %p134;
410
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
411
+ setp.lt.s32 %p136, %r135, %r225;
412
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
413
+ and.pred %p137, %p136, %p135;
414
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
415
+ or.pred %p138, %p133, %p137;
416
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
417
+ selp.f32 %r140, %r134, %r137, %p138;
418
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
419
+ selp.b32 %r141, %r135, %r225, %p138;
420
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
421
+ setp.gt.f32 %p139, %r140, %r136;
422
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
423
+ setp.eq.f32 %p140, %r140, %r136;
424
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
425
+ setp.nan.f32 %p141, %r140, %r140;
426
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
427
+ and.pred %p142, %p141, %p129;
428
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
429
+ or.pred %p143, %p139, %p142;
430
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
431
+ and.pred %p144, %p128, %p141;
432
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
433
+ or.pred %p145, %p140, %p144;
434
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
435
+ setp.lt.s32 %p146, %r141, %r224;
436
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
437
+ and.pred %p147, %p146, %p145;
438
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
439
+ or.pred %p148, %p143, %p147;
440
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
441
+ selp.f32 %r142, %r140, %r136, %p148;
442
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
443
+ selp.b32 %r143, %r141, %r224, %p148;
444
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
445
+ mov.b64 {%r144, %r145}, %rd60;
446
+ setp.gt.f32 %p149, %r144, %r145;
447
+ setp.eq.f32 %p150, %r145, %r144;
448
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
449
+ setp.nan.f32 %p151, %r144, %r144;
450
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
451
+ setp.num.f32 %p152, %r145, %r145;
452
+ setp.nan.f32 %p153, %r145, %r145;
453
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
454
+ and.pred %p154, %p151, %p153;
455
+ and.pred %p155, %p151, %p152;
456
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
457
+ or.pred %p156, %p149, %p155;
458
+ or.pred %p157, %p150, %p154;
459
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
460
+ setp.lt.s32 %p158, %r220, %r221;
461
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
462
+ and.pred %p159, %p158, %p157;
463
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
464
+ or.pred %p160, %p156, %p159;
465
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
466
+ selp.f32 %r146, %r144, %r145, %p160;
467
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
468
+ selp.b32 %r147, %r220, %r221, %p160;
469
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
470
+ setp.gt.f32 %p161, %r146, %r139;
471
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
472
+ setp.eq.f32 %p162, %r146, %r139;
473
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
474
+ setp.nan.f32 %p163, %r146, %r146;
475
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
476
+ and.pred %p164, %p163, %p127;
477
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
478
+ or.pred %p165, %p161, %p164;
479
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
480
+ and.pred %p166, %p126, %p163;
481
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
482
+ or.pred %p167, %p162, %p166;
483
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
484
+ setp.lt.s32 %p168, %r147, %r223;
485
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
486
+ and.pred %p169, %p168, %p167;
487
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
488
+ or.pred %p170, %p165, %p169;
489
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
490
+ selp.f32 %r148, %r146, %r139, %p170;
491
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
492
+ selp.b32 %r149, %r147, %r223, %p170;
493
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
494
+ setp.gt.f32 %p171, %r148, %r138;
495
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
496
+ setp.eq.f32 %p172, %r148, %r138;
497
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
498
+ setp.nan.f32 %p173, %r148, %r148;
499
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
500
+ and.pred %p174, %p173, %p125;
501
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
502
+ or.pred %p175, %p171, %p174;
503
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
504
+ and.pred %p176, %p124, %p173;
505
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
506
+ or.pred %p177, %p172, %p176;
507
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
508
+ setp.lt.s32 %p178, %r149, %r222;
509
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
510
+ and.pred %p179, %p178, %p177;
511
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
512
+ or.pred %p180, %p175, %p179;
513
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
514
+ selp.f32 %r150, %r148, %r138, %p180;
515
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
516
+ selp.b32 %r151, %r149, %r222, %p180;
517
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
518
+ shfl.sync.bfly.b32 %r152, %r142, 16, 31, -1;
519
+ shfl.sync.bfly.b32 %r153, %r143, 16, 31, -1;
520
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
521
+ setp.gt.f32 %p181, %r142, %r152;
522
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
523
+ setp.eq.f32 %p182, %r142, %r152;
524
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
525
+ setp.nan.f32 %p183, %r142, %r142;
526
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
527
+ setp.nan.f32 %p184, %r152, %r152;
528
+ setp.num.f32 %p185, %r152, %r152;
529
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
530
+ and.pred %p186, %p183, %p185;
531
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
532
+ or.pred %p187, %p181, %p186;
533
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
534
+ and.pred %p188, %p183, %p184;
535
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
536
+ or.pred %p189, %p182, %p188;
537
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
538
+ setp.lt.s32 %p190, %r143, %r153;
539
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
540
+ and.pred %p191, %p190, %p189;
541
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
542
+ or.pred %p192, %p187, %p191;
543
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
544
+ selp.f32 %r154, %r142, %r152, %p192;
545
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
546
+ selp.b32 %r155, %r143, %r153, %p192;
547
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
548
+ shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1;
549
+ shfl.sync.bfly.b32 %r157, %r155, 8, 31, -1;
550
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
551
+ setp.gt.f32 %p193, %r154, %r156;
552
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
553
+ setp.eq.f32 %p194, %r154, %r156;
554
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
555
+ setp.nan.f32 %p195, %r154, %r154;
556
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
557
+ setp.nan.f32 %p196, %r156, %r156;
558
+ setp.num.f32 %p197, %r156, %r156;
559
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
560
+ and.pred %p198, %p195, %p197;
561
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
562
+ or.pred %p199, %p193, %p198;
563
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
564
+ and.pred %p200, %p196, %p195;
565
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
566
+ or.pred %p201, %p194, %p200;
567
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
568
+ setp.lt.s32 %p202, %r155, %r157;
569
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
570
+ and.pred %p203, %p202, %p201;
571
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
572
+ or.pred %p204, %p199, %p203;
573
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
574
+ selp.f32 %r158, %r154, %r156, %p204;
575
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
576
+ selp.b32 %r159, %r155, %r157, %p204;
577
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
578
+ shfl.sync.bfly.b32 %r160, %r158, 4, 31, -1;
579
+ shfl.sync.bfly.b32 %r161, %r159, 4, 31, -1;
580
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
581
+ setp.gt.f32 %p205, %r158, %r160;
582
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
583
+ setp.eq.f32 %p206, %r158, %r160;
584
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
585
+ setp.nan.f32 %p207, %r158, %r158;
586
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
587
+ setp.nan.f32 %p208, %r160, %r160;
588
+ setp.num.f32 %p209, %r160, %r160;
589
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
590
+ and.pred %p210, %p207, %p209;
591
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
592
+ or.pred %p211, %p205, %p210;
593
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
594
+ and.pred %p212, %p208, %p207;
595
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
596
+ or.pred %p213, %p206, %p212;
597
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
598
+ setp.lt.s32 %p214, %r159, %r161;
599
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
600
+ and.pred %p215, %p214, %p213;
601
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
602
+ or.pred %p216, %p211, %p215;
603
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
604
+ selp.f32 %r162, %r158, %r160, %p216;
605
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
606
+ selp.b32 %r163, %r159, %r161, %p216;
607
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
608
+ shfl.sync.bfly.b32 %r164, %r162, 2, 31, -1;
609
+ shfl.sync.bfly.b32 %r165, %r163, 2, 31, -1;
610
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
611
+ setp.gt.f32 %p217, %r162, %r164;
612
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
613
+ setp.eq.f32 %p218, %r162, %r164;
614
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
615
+ setp.nan.f32 %p219, %r162, %r162;
616
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
617
+ setp.nan.f32 %p220, %r164, %r164;
618
+ setp.num.f32 %p221, %r164, %r164;
619
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
620
+ and.pred %p222, %p219, %p221;
621
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
622
+ or.pred %p223, %p217, %p222;
623
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
624
+ and.pred %p224, %p220, %p219;
625
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
626
+ or.pred %p225, %p218, %p224;
627
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
628
+ setp.lt.s32 %p226, %r163, %r165;
629
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
630
+ and.pred %p227, %p226, %p225;
631
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
632
+ or.pred %p228, %p223, %p227;
633
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
634
+ selp.f32 %r166, %r162, %r164, %p228;
635
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
636
+ selp.b32 %r167, %r163, %r165, %p228;
637
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
638
+ shfl.sync.bfly.b32 %r168, %r166, 1, 31, -1;
639
+ shfl.sync.bfly.b32 %r169, %r167, 1, 31, -1;
640
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
641
+ setp.gt.f32 %p229, %r166, %r168;
642
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
643
+ setp.eq.f32 %p230, %r166, %r168;
644
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
645
+ setp.nan.f32 %p231, %r166, %r166;
646
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
647
+ setp.nan.f32 %p232, %r168, %r168;
648
+ setp.num.f32 %p233, %r168, %r168;
649
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
650
+ and.pred %p234, %p231, %p233;
651
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
652
+ or.pred %p235, %p229, %p234;
653
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
654
+ and.pred %p236, %p232, %p231;
655
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
656
+ or.pred %p237, %p230, %p236;
657
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
658
+ setp.lt.s32 %p238, %r167, %r169;
659
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
660
+ and.pred %p239, %p238, %p237;
661
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
662
+ or.pred %p240, %p235, %p239;
663
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
664
+ selp.b32 %r116, %r167, %r169, %p240;
665
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
666
+ shfl.sync.bfly.b32 %r170, %r150, 16, 31, -1;
667
+ shfl.sync.bfly.b32 %r171, %r151, 16, 31, -1;
668
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
669
+ setp.gt.f32 %p241, %r150, %r170;
670
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
671
+ setp.eq.f32 %p242, %r150, %r170;
672
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
673
+ setp.nan.f32 %p243, %r150, %r150;
674
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
675
+ setp.nan.f32 %p244, %r170, %r170;
676
+ setp.num.f32 %p245, %r170, %r170;
677
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
678
+ and.pred %p246, %p243, %p245;
679
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
680
+ or.pred %p247, %p241, %p246;
681
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
682
+ and.pred %p248, %p243, %p244;
683
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
684
+ or.pred %p249, %p242, %p248;
685
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
686
+ setp.lt.s32 %p250, %r151, %r171;
687
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
688
+ and.pred %p251, %p250, %p249;
689
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
690
+ or.pred %p252, %p247, %p251;
691
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
692
+ selp.f32 %r172, %r150, %r170, %p252;
693
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
694
+ selp.b32 %r173, %r151, %r171, %p252;
695
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
696
+ shfl.sync.bfly.b32 %r174, %r172, 8, 31, -1;
697
+ shfl.sync.bfly.b32 %r175, %r173, 8, 31, -1;
698
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
699
+ setp.gt.f32 %p253, %r172, %r174;
700
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
701
+ setp.eq.f32 %p254, %r172, %r174;
702
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
703
+ setp.nan.f32 %p255, %r172, %r172;
704
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
705
+ setp.nan.f32 %p256, %r174, %r174;
706
+ setp.num.f32 %p257, %r174, %r174;
707
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
708
+ and.pred %p258, %p255, %p257;
709
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
710
+ or.pred %p259, %p253, %p258;
711
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
712
+ and.pred %p260, %p256, %p255;
713
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
714
+ or.pred %p261, %p254, %p260;
715
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
716
+ setp.lt.s32 %p262, %r173, %r175;
717
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
718
+ and.pred %p263, %p262, %p261;
719
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
720
+ or.pred %p264, %p259, %p263;
721
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
722
+ selp.f32 %r176, %r172, %r174, %p264;
723
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
724
+ selp.b32 %r177, %r173, %r175, %p264;
725
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
726
+ shfl.sync.bfly.b32 %r178, %r176, 4, 31, -1;
727
+ shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1;
728
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
729
+ setp.gt.f32 %p265, %r176, %r178;
730
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
731
+ setp.eq.f32 %p266, %r176, %r178;
732
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
733
+ setp.nan.f32 %p267, %r176, %r176;
734
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
735
+ setp.nan.f32 %p268, %r178, %r178;
736
+ setp.num.f32 %p269, %r178, %r178;
737
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
738
+ and.pred %p270, %p267, %p269;
739
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
740
+ or.pred %p271, %p265, %p270;
741
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
742
+ and.pred %p272, %p268, %p267;
743
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
744
+ or.pred %p273, %p266, %p272;
745
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
746
+ setp.lt.s32 %p274, %r177, %r179;
747
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
748
+ and.pred %p275, %p274, %p273;
749
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
750
+ or.pred %p276, %p271, %p275;
751
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
752
+ selp.f32 %r180, %r176, %r178, %p276;
753
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
754
+ selp.b32 %r181, %r177, %r179, %p276;
755
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
756
+ shfl.sync.bfly.b32 %r182, %r180, 2, 31, -1;
757
+ shfl.sync.bfly.b32 %r183, %r181, 2, 31, -1;
758
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
759
+ setp.gt.f32 %p277, %r180, %r182;
760
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
761
+ setp.eq.f32 %p278, %r180, %r182;
762
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
763
+ setp.nan.f32 %p279, %r180, %r180;
764
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
765
+ setp.nan.f32 %p280, %r182, %r182;
766
+ setp.num.f32 %p281, %r182, %r182;
767
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
768
+ and.pred %p282, %p279, %p281;
769
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
770
+ or.pred %p283, %p277, %p282;
771
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
772
+ and.pred %p284, %p280, %p279;
773
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
774
+ or.pred %p285, %p278, %p284;
775
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
776
+ setp.lt.s32 %p286, %r181, %r183;
777
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
778
+ and.pred %p287, %p286, %p285;
779
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
780
+ or.pred %p288, %p283, %p287;
781
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
782
+ selp.f32 %r184, %r180, %r182, %p288;
783
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
784
+ selp.b32 %r185, %r181, %r183, %p288;
785
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
786
+ shfl.sync.bfly.b32 %r186, %r184, 1, 31, -1;
787
+ shfl.sync.bfly.b32 %r187, %r185, 1, 31, -1;
788
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
789
+ setp.gt.f32 %p289, %r184, %r186;
790
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
791
+ setp.eq.f32 %p290, %r184, %r186;
792
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
793
+ setp.nan.f32 %p291, %r184, %r184;
794
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
795
+ setp.nan.f32 %p292, %r186, %r186;
796
+ setp.num.f32 %p293, %r186, %r186;
797
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
798
+ and.pred %p294, %p291, %p293;
799
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
800
+ or.pred %p295, %p289, %p294;
801
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
802
+ and.pred %p296, %p292, %p291;
803
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
804
+ or.pred %p297, %p290, %p296;
805
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
806
+ setp.lt.s32 %p298, %r185, %r187;
807
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
808
+ and.pred %p299, %p298, %p297;
809
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
810
+ or.pred %p300, %p295, %p299;
811
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
812
+ selp.b32 %r120, %r185, %r187, %p300;
813
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
814
+ bfe.u32 %r188, %r2, 5, 2;
815
+ setp.eq.b32 %p100, %r131, 0;
816
+ shr.u32 %r189, %r3, 5;
817
+ or.b32 %r190, %r189, %r188;
818
+ shl.b32 %r191, %r190, 2;
819
+ mov.b32 %r192, global_smem;
820
+ add.s32 %r113, %r192, %r191;
821
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
822
+ selp.b32 %r114, %r166, %r168, %p240;
823
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
824
+ // begin inline asm
825
+ @%p100 st.shared.b32 [ %r113 + 0 ], %r114;
826
+ // end inline asm
827
+ add.s32 %r193, %r192, 128;
828
+ add.s32 %r115, %r193, %r191;
829
+ // begin inline asm
830
+ @%p100 st.shared.b32 [ %r115 + 0 ], %r116;
831
+ // end inline asm
832
+ shl.b32 %r194, %r188, 2;
833
+ shl.b32 %r195, %r4, 4;
834
+ or.b32 %r196, %r195, %r194;
835
+ add.s32 %r117, %r192, %r196;
836
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
837
+ selp.b32 %r118, %r184, %r186, %p300;
838
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
839
+ // begin inline asm
840
+ @%p100 st.shared.b32 [ %r117 + 0 ], %r118;
841
+ // end inline asm
842
+ add.s32 %r119, %r193, %r196;
843
+ // begin inline asm
844
+ @%p100 st.shared.b32 [ %r119 + 0 ], %r120;
845
+ // end inline asm
846
+ bar.sync 0;
847
+ setp.lt.u32 %p104, %r2, 32;
848
+ add.s32 %r122, %r192, %r31;
849
+ // begin inline asm
850
+ @%p104 ld.shared.b32 %r121, [ %r122 + 0 ];
851
+ // end inline asm
852
+ add.s32 %r124, %r193, %r31;
853
+ // begin inline asm
854
+ @%p104 ld.shared.b32 %r123, [ %r124 + 0 ];
855
+ // end inline asm
856
+ shfl.sync.bfly.b32 %r198, %r121, 2, 31, -1;
857
+ shfl.sync.bfly.b32 %r199, %r123, 2, 31, -1;
858
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
859
+ setp.gt.f32 %p301, %r121, %r198;
860
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
861
+ setp.eq.f32 %p302, %r121, %r198;
862
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
863
+ setp.nan.f32 %p303, %r121, %r121;
864
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
865
+ setp.nan.f32 %p304, %r198, %r198;
866
+ setp.num.f32 %p305, %r198, %r198;
867
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
868
+ and.pred %p306, %p303, %p305;
869
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
870
+ or.pred %p307, %p301, %p306;
871
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
872
+ and.pred %p308, %p303, %p304;
873
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
874
+ or.pred %p309, %p302, %p308;
875
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
876
+ setp.lt.s32 %p310, %r123, %r199;
877
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
878
+ and.pred %p311, %p310, %p309;
879
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
880
+ or.pred %p312, %p307, %p311;
881
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
882
+ selp.f32 %r200, %r121, %r198, %p312;
883
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
884
+ selp.b32 %r201, %r123, %r199, %p312;
885
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
886
+ shfl.sync.bfly.b32 %r202, %r200, 1, 31, -1;
887
+ shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1;
888
+ .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
889
+ setp.gt.f32 %p313, %r200, %r202;
890
+ .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
891
+ setp.eq.f32 %p314, %r200, %r202;
892
+ .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
893
+ setp.nan.f32 %p315, %r200, %r200;
894
+ .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
895
+ setp.nan.f32 %p316, %r202, %r202;
896
+ setp.num.f32 %p317, %r202, %r202;
897
+ .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
898
+ and.pred %p318, %p315, %p317;
899
+ .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
900
+ or.pred %p319, %p313, %p318;
901
+ .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
902
+ and.pred %p320, %p316, %p315;
903
+ .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
904
+ or.pred %p321, %p314, %p320;
905
+ .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
906
+ setp.lt.s32 %p322, %r201, %r203;
907
+ .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
908
+ and.pred %p323, %p322, %p321;
909
+ .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
910
+ or.pred %p324, %p319, %p323;
911
+ .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
912
+ selp.b32 %r128, %r201, %r203, %p324;
913
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
914
+ and.b32 %r204, %r2, 995;
915
+ setp.eq.b32 %p106, %r204, 0;
916
+ .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
917
+ selp.b32 %r126, %r200, %r202, %p324;
918
+ .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
919
+ // begin inline asm
920
+ @%p106 st.shared.b32 [ %r122 + 0 ], %r126;
921
+ // end inline asm
922
+ // begin inline asm
923
+ @%p106 st.shared.b32 [ %r124 + 0 ], %r128;
924
+ // end inline asm
925
+ bar.sync 0;
926
+ shr.u32 %r205, %r3, 3;
927
+ add.s32 %r206, %r193, %r205;
928
+ ld.shared.b32 %r207, [%r206];
929
+ add.s32 %r208, %r193, %r195;
930
+ ld.shared.b32 %r209, [%r208];
931
+ $L__tmp10:
932
+ .loc 1 48 25 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:25
933
+ mad.wide.s32 %rd57, %r130, 8, %rd15;
934
+ .loc 1 48 36 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:36
935
+ bar.sync 0;
936
+ shr.u32 %r210, %r3, 4;
937
+ add.s32 %r211, %r192, %r210;
938
+ st.shared.v2.b32 [%r211], {%r207, %r209};
939
+ bar.sync 0;
940
+ shl.b32 %r212, %r2, 3;
941
+ and.b32 %r213, %r212, 24;
942
+ and.b32 %r214, %r2, 4;
943
+ add.s32 %r215, %r192, %r213;
944
+ add.s32 %r216, %r215, %r214;
945
+ ld.shared.s32 %rd56, [%r216];
946
+ and.b32 %r217, %r2, 504;
947
+ setp.eq.b32 %p108, %r217, 0;
948
+ // begin inline asm
949
+ @%p108 st.global.b64 [ %rd57 + 0 ], { %rd56 };
950
+ // end inline asm
951
+ .loc 1 48 4 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:4
952
+ ret;
953
+ $L__tmp11:
954
+ $L__func_end0:
955
+ // -- End function
956
+ }
957
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py"
958
+ .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
959
+ .section .debug_abbrev
960
+ {
961
+ .b8 1 // Abbreviation Code
962
+ .b8 17 // DW_TAG_compile_unit
963
+ .b8 1 // DW_CHILDREN_yes
964
+ .b8 37 // DW_AT_producer
965
+ .b8 8 // DW_FORM_string
966
+ .b8 19 // DW_AT_language
967
+ .b8 5 // DW_FORM_data2
968
+ .b8 3 // DW_AT_name
969
+ .b8 8 // DW_FORM_string
970
+ .b8 16 // DW_AT_stmt_list
971
+ .b8 6 // DW_FORM_data4
972
+ .b8 27 // DW_AT_comp_dir
973
+ .b8 8 // DW_FORM_string
974
+ .b8 0 // EOM(1)
975
+ .b8 0 // EOM(2)
976
+ .b8 2 // Abbreviation Code
977
+ .b8 46 // DW_TAG_subprogram
978
+ .b8 0 // DW_CHILDREN_no
979
+ .b8 3 // DW_AT_name
980
+ .b8 8 // DW_FORM_string
981
+ .b8 32 // DW_AT_inline
982
+ .b8 11 // DW_FORM_data1
983
+ .b8 0 // EOM(1)
984
+ .b8 0 // EOM(2)
985
+ .b8 3 // Abbreviation Code
986
+ .b8 46 // DW_TAG_subprogram
987
+ .b8 1 // DW_CHILDREN_yes
988
+ .b8 17 // DW_AT_low_pc
989
+ .b8 1 // DW_FORM_addr
990
+ .b8 18 // DW_AT_high_pc
991
+ .b8 1 // DW_FORM_addr
992
+ .b8 49 // DW_AT_abstract_origin
993
+ .b8 19 // DW_FORM_ref4
994
+ .b8 0 // EOM(1)
995
+ .b8 0 // EOM(2)
996
+ .b8 4 // Abbreviation Code
997
+ .b8 29 // DW_TAG_inlined_subroutine
998
+ .b8 0 // DW_CHILDREN_no
999
+ .b8 49 // DW_AT_abstract_origin
1000
+ .b8 19 // DW_FORM_ref4
1001
+ .b8 17 // DW_AT_low_pc
1002
+ .b8 1 // DW_FORM_addr
1003
+ .b8 18 // DW_AT_high_pc
1004
+ .b8 1 // DW_FORM_addr
1005
+ .b8 88 // DW_AT_call_file
1006
+ .b8 11 // DW_FORM_data1
1007
+ .b8 89 // DW_AT_call_line
1008
+ .b8 11 // DW_FORM_data1
1009
+ .b8 87 // DW_AT_call_column
1010
+ .b8 11 // DW_FORM_data1
1011
+ .b8 0 // EOM(1)
1012
+ .b8 0 // EOM(2)
1013
+ .b8 0 // EOM(3)
1014
+ }
1015
+ .section .debug_info
1016
+ {
1017
+ .b32 234 // Length of Unit
1018
+ .b8 2 // DWARF version number
1019
+ .b8 0
1020
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
1021
+ .b8 8 // Address Size (in bytes)
1022
+ .b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit
1023
+ .b8 116 // DW_AT_producer
1024
+ .b8 114
1025
+ .b8 105
1026
+ .b8 116
1027
+ .b8 111
1028
+ .b8 110
1029
+ .b8 0
1030
+ .b8 2 // DW_AT_language
1031
+ .b8 0
1032
+ .b8 99 // DW_AT_name
1033
+ .b8 102
1034
+ .b8 118
1035
+ .b8 109
1036
+ .b8 54
1037
+ .b8 53
1038
+ .b8 53
1039
+ .b8 106
1040
+ .b8 53
1041
+ .b8 99
1042
+ .b8 109
1043
+ .b8 52
1044
+ .b8 53
1045
+ .b8 50
1046
+ .b8 52
1047
+ .b8 103
1048
+ .b8 109
1049
+ .b8 100
1050
+ .b8 121
1051
+ .b8 104
1052
+ .b8 114
1053
+ .b8 55
1054
+ .b8 121
1055
+ .b8 108
1056
+ .b8 105
1057
+ .b8 54
1058
+ .b8 100
1059
+ .b8 102
1060
+ .b8 102
1061
+ .b8 112
1062
+ .b8 97
1063
+ .b8 107
1064
+ .b8 121
1065
+ .b8 115
1066
+ .b8 117
1067
+ .b8 121
1068
+ .b8 99
1069
+ .b8 117
1070
+ .b8 111
1071
+ .b8 122
1072
+ .b8 107
1073
+ .b8 113
1074
+ .b8 109
1075
+ .b8 121
1076
+ .b8 117
1077
+ .b8 97
1078
+ .b8 111
1079
+ .b8 110
1080
+ .b8 107
1081
+ .b8 119
1082
+ .b8 98
1083
+ .b8 103
1084
+ .b8 46
1085
+ .b8 112
1086
+ .b8 121
1087
+ .b8 0
1088
+ .b32 .debug_line // DW_AT_stmt_list
1089
+ .b8 47 // DW_AT_comp_dir
1090
+ .b8 119
1091
+ .b8 111
1092
+ .b8 114
1093
+ .b8 107
1094
+ .b8 115
1095
+ .b8 112
1096
+ .b8 97
1097
+ .b8 99
1098
+ .b8 101
1099
+ .b8 47
1100
+ .b8 104
1101
+ .b8 97
1102
+ .b8 110
1103
+ .b8 114
1104
+ .b8 117
1105
+ .b8 105
1106
+ .b8 47
1107
+ .b8 83
1108
+ .b8 112
1109
+ .b8 101
1110
+ .b8 99
1111
+ .b8 70
1112
+ .b8 111
1113
+ .b8 114
1114
+ .b8 103
1115
+ .b8 101
1116
+ .b8 45
1117
+ .b8 101
1118
+ .b8 120
1119
+ .b8 116
1120
+ .b8 47
1121
+ .b8 99
1122
+ .b8 97
1123
+ .b8 99
1124
+ .b8 104
1125
+ .b8 101
1126
+ .b8 47
1127
+ .b8 99
1128
+ .b8 111
1129
+ .b8 109
1130
+ .b8 112
1131
+ .b8 105
1132
+ .b8 108
1133
+ .b8 101
1134
+ .b8 100
1135
+ .b8 95
1136
+ .b8 107
1137
+ .b8 101
1138
+ .b8 114
1139
+ .b8 110
1140
+ .b8 101
1141
+ .b8 108
1142
+ .b8 115
1143
+ .b8 47
1144
+ .b8 102
1145
+ .b8 118
1146
+ .b8 0
1147
+ .b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram
1148
+ .b8 116 // DW_AT_name
1149
+ .b8 114
1150
+ .b8 105
1151
+ .b8 116
1152
+ .b8 111
1153
+ .b8 110
1154
+ .b8 95
1155
+ .b8 114
1156
+ .b8 101
1157
+ .b8 100
1158
+ .b8 95
1159
+ .b8 102
1160
+ .b8 117
1161
+ .b8 115
1162
+ .b8 101
1163
+ .b8 100
1164
+ .b8 95
1165
+ .b8 97
1166
+ .b8 114
1167
+ .b8 103
1168
+ .b8 109
1169
+ .b8 97
1170
+ .b8 120
1171
+ .b8 95
1172
+ .b8 49
1173
+ .b8 0
1174
+ .b8 1 // DW_AT_inline
1175
+ .b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram
1176
+ .b64 $L__func_begin0 // DW_AT_low_pc
1177
+ .b64 $L__func_end0 // DW_AT_high_pc
1178
+ .b32 139 // DW_AT_abstract_origin
1179
+ .b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine
1180
+ .b32 139 // DW_AT_abstract_origin
1181
+ .b64 $L__tmp1 // DW_AT_low_pc
1182
+ .b64 $L__tmp8 // DW_AT_high_pc
1183
+ .b8 1 // DW_AT_call_file
1184
+ .b8 42 // DW_AT_call_line
1185
+ .b8 38 // DW_AT_call_column
1186
+ .b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine
1187
+ .b32 139 // DW_AT_abstract_origin
1188
+ .b64 $L__tmp9 // DW_AT_low_pc
1189
+ .b64 $L__tmp10 // DW_AT_high_pc
1190
+ .b8 1 // DW_AT_call_file
1191
+ .b8 46 // DW_AT_call_line
1192
+ .b8 75 // DW_AT_call_column
1193
+ .b8 0 // End Of Children Mark
1194
+ .b8 0 // End Of Children Mark
1195
+ }
1196
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0)
2
+ #loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0)
3
+ #loc45 = loc(unknown)
4
+ #loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0)
5
+ #loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0)
6
+ #loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0)
7
+ #loc70 = loc("in_ptr0"(#loc))
8
+ #loc71 = loc("out_ptr0"(#loc))
9
+ #loc72 = loc("xnumel"(#loc))
10
+ #loc73 = loc("r0_numel"(#loc))
11
+ #loc100 = loc("a_value"(#loc33))
12
+ #loc101 = loc("a_index"(#loc33))
13
+ #loc102 = loc("b_value"(#loc33))
14
+ #loc103 = loc("b_index"(#loc33))
15
+ #loc116 = loc("x"(#loc53))
16
+ #loc117 = loc("x"(#loc57))
17
+ #loc118 = loc("value"(#loc66))
18
+ #loc119 = loc("index"(#loc66))
19
+ module {
20
+ tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
21
+ %xnumel_0 = arith.constant 16384 : i32 loc(#loc74)
22
+ %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75)
23
+ %xoffset = tt.get_program_id x : i32 loc(#loc76)
24
+ %xoffset_2 = arith.constant 8 : i32 loc(#loc77)
25
+ %xoffset_3 = arith.constant 8 : i32 loc(#loc77)
26
+ %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77)
27
+ %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc78)
28
+ %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc79)
29
+ %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc80)
30
+ %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc80)
31
+ %xmask = arith.constant true loc(#loc81)
32
+ %xmask_8 = arith.constant dense<true> : tensor<8x512xi1> loc(#loc81)
33
+ %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc82)
34
+ %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc83)
35
+ %x0 = arith.constant 2048 : i32 loc(#loc84)
36
+ %x0_10 = arith.constant 2048 : i32 loc(#loc84)
37
+ %x0_11 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc84)
38
+ %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc84)
39
+ %x1 = arith.constant 2048 : i32 loc(#loc85)
40
+ %x1_13 = arith.constant 2048 : i32 loc(#loc85)
41
+ %x1_14 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc85)
42
+ %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc85)
43
+ %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86)
44
+ %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc86)
45
+ %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87)
46
+ %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc87)
47
+ %c0_i32 = arith.constant 0 : i32 loc(#loc15)
48
+ %c512_i32 = arith.constant 512 : i32 loc(#loc15)
49
+ %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
50
+ %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
51
+ %2 = arith.bitcast %c512_i32 : i32 to i32 loc(#loc15)
52
+ %3 = ub.poison : i32 loc(#loc15)
53
+ %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 {
54
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc89)
55
+ %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x512xi32> loc(#loc89)
56
+ %r0_mask = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc90)
57
+ %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x512xi32> loc(#loc90)
58
+ %tmp0 = arith.constant 32000 : i32 loc(#loc91)
59
+ %tmp0_23 = arith.constant 32000 : i32 loc(#loc91)
60
+ %tmp0_24 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc91)
61
+ %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<8x1xi32> loc(#loc91)
62
+ %tmp0_26 = tt.broadcast %r0_index_21 : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc92)
63
+ %tmp0_27 = tt.broadcast %tmp0_25 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc92)
64
+ %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x512xi32> loc(#loc92)
65
+ %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93)
66
+ %tmp0_30 = arith.constant 65760000 : i32 loc(#loc93)
67
+ %tmp0_31 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc93)
68
+ %tmp0_32 = arith.muli %tmp0_31, %x1_15 : tensor<8x1xi32> loc(#loc93)
69
+ %tmp0_33 = tt.broadcast %tmp0_32 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc94)
70
+ %tmp0_34 = arith.addi %tmp0_28, %tmp0_33 : tensor<8x512xi32> loc(#loc94)
71
+ %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<8x512x!tt.ptr<f32>> loc(#loc95)
72
+ %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<8x512x!tt.ptr<f32>>, tensor<8x512xi32> loc(#loc95)
73
+ %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc96)
74
+ %tmp0_38 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc96)
75
+ %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc96)
76
+ %tmp0_40 = tt.load %tmp0_36, %tmp0_38, %tmp0_39 evictionPolicy = evict_first : tensor<8x512x!tt.ptr<f32>> loc(#loc96)
77
+ %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%_tmp2_19, %_tmp2_index_20, %tmp0_40, %r0_index_21) : (tensor<8x512xf32>, tensor<8x512xi32>, tensor<8x512xf32>, tensor<1x512xi32>) -> (tensor<8x512xf32>, tensor<8x512xi32>) loc(#loc24)
78
+ %_tmp2_41 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc97)
79
+ %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_19 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc97)
80
+ %_tmp2_index_43 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc98)
81
+ %_tmp2_index_44 = arith.select %_tmp2_index_43, %8#1, %_tmp2_index_20 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc98)
82
+ scf.yield %_tmp2_42, %_tmp2_index_44 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc27)
83
+ } loc(#loc120)
84
+ %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc28)
85
+ %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc99)
86
+ %5 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<8x1x!tt.ptr<i64>> loc(#loc30)
87
+ %6 = tt.addptr %5, %xindex_7 : tensor<8x1x!tt.ptr<i64>>, tensor<8x1xi32> loc(#loc30)
88
+ %7 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc31)
89
+ tt.store %6, %7 : tensor<8x1x!tt.ptr<i64>> loc(#loc31)
90
+ tt.return loc(#loc32)
91
+ } loc(#loc)
92
+ tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%a_value: tensor<8x512xf32> loc("a_value"(#loc33)), %a_index: tensor<8x512xi32> loc("a_index"(#loc33)), %b_value: tensor<8x512xf32> loc("b_value"(#loc33)), %b_index: tensor<1x512xi32> loc("b_index"(#loc33))) -> (tensor<8x512xf32>, tensor<8x512xi32>) attributes {noinline = false} {
93
+ %mask = arith.cmpf ogt, %a_value, %b_value : tensor<8x512xf32> loc(#loc121)
94
+ %equal = arith.cmpf oeq, %a_value, %b_value : tensor<8x512xf32> loc(#loc122)
95
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%a_value) : (tensor<8x512xf32>) -> i1 loc(#loc36)
96
+ %1:2 = scf.if %0 -> (tensor<8x512xi1>, tensor<8x512xi1>) {
97
+ %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<8x512xf32> loc(#loc106)
98
+ %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<8x512xf32> loc(#loc107)
99
+ %mask_4 = arith.constant true loc(#loc108)
100
+ %mask_5 = arith.constant dense<true> : tensor<8x512xi1> loc(#loc108)
101
+ %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<8x512xi1> loc(#loc108)
102
+ %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<8x512xi1> loc(#loc109)
103
+ %mask_8 = arith.ori %mask, %mask_7 : tensor<8x512xi1> loc(#loc123)
104
+ %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc111)
105
+ %equal_10 = arith.ori %equal, %equal_9 : tensor<8x512xi1> loc(#loc124)
106
+ scf.yield %mask_8, %equal_10 : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc124)
107
+ } else {
108
+ scf.yield %mask, %equal : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc45)
109
+ } loc(#loc37)
110
+ %mask_0 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc113)
111
+ %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<8x512xi32> loc(#loc113)
112
+ %mask_2 = arith.andi %1#1, %mask_1 : tensor<8x512xi1> loc(#loc114)
113
+ %mask_3 = arith.ori %1#0, %mask_2 : tensor<8x512xi1> loc(#loc115)
114
+ %2 = arith.select %mask_3, %a_value, %b_value : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc49)
115
+ %3 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc50)
116
+ %4 = arith.select %mask_3, %a_index, %3 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc50)
117
+ tt.return %2, %4 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc51)
118
+ ^bb1: // no predecessors
119
+ %5 = ub.poison : tensor<8x512xf32> loc(#loc52)
120
+ %6 = ub.poison : tensor<8x512xi32> loc(#loc52)
121
+ tt.return %5, %6 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc52)
122
+ } loc(#loc33)
123
+ tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} {
124
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x) : (tensor<8x512xf32>) -> tensor<8x512xf32> loc(#loc54)
125
+ %true = arith.constant true loc(#loc55)
126
+ tt.return %true : i1 loc(#loc55)
127
+ ^bb1: // no predecessors
128
+ %1 = ub.poison : i1 loc(#loc56)
129
+ tt.return %1 : i1 loc(#loc56)
130
+ } loc(#loc53)
131
+ tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc57))) -> tensor<8x512xf32> attributes {noinline = false} {
132
+ %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58)
133
+ %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59)
134
+ %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59)
135
+ %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<8x512xf32> loc(#loc59)
136
+ %4 = arith.addf %x, %3 : tensor<8x512xf32> loc(#loc59)
137
+ tt.return %4 : tensor<8x512xf32> loc(#loc60)
138
+ ^bb1: // no predecessors
139
+ %5 = ub.poison : tensor<8x512xf32> loc(#loc61)
140
+ tt.return %5 : tensor<8x512xf32> loc(#loc61)
141
+ } loc(#loc57)
142
+ tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
143
+ %false = arith.constant false loc(#loc63)
144
+ %cst = arith.constant dense<false> : tensor<1xi1> loc(#loc63)
145
+ tt.return %cst : tensor<1xi1> loc(#loc64)
146
+ ^bb1: // no predecessors
147
+ %0 = ub.poison : tensor<1xi1> loc(#loc65)
148
+ tt.return %0 : tensor<1xi1> loc(#loc65)
149
+ } loc(#loc62)
150
+ tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%value: tensor<8x512xf32> loc("value"(#loc66)), %index: tensor<8x512xi32> loc("index"(#loc66))) -> (tensor<8xf32>, tensor<8xi32>) attributes {noinline = false} {
151
+ %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({
152
+ ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)):
153
+ %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67)
154
+ tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67)
155
+ }) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc67)
156
+ tt.return %0#0, %0#1 : tensor<8xf32>, tensor<8xi32> loc(#loc68)
157
+ ^bb1: // no predecessors
158
+ %1 = ub.poison : tensor<8xf32> loc(#loc69)
159
+ %2 = ub.poison : tensor<8xi32> loc(#loc69)
160
+ tt.return %1, %2 : tensor<8xf32>, tensor<8xi32> loc(#loc69)
161
+ } loc(#loc66)
162
+ tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} {
163
+ %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121)
164
+ %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122)
165
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36)
166
+ %1:2 = scf.if %0 -> (i1, i1) {
167
+ %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106)
168
+ %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107)
169
+ %mask_3 = arith.constant true loc(#loc108)
170
+ %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108)
171
+ %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109)
172
+ %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123)
173
+ %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111)
174
+ %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124)
175
+ scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124)
176
+ } else {
177
+ scf.yield %mask, %equal : i1, i1 loc(#loc45)
178
+ } loc(#loc37)
179
+ %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113)
180
+ %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114)
181
+ %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115)
182
+ %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49)
183
+ %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50)
184
+ tt.return %2, %3 : f32, i32 loc(#loc51)
185
+ ^bb1: // no predecessors
186
+ %4 = ub.poison : f32 loc(#loc52)
187
+ %5 = ub.poison : i32 loc(#loc52)
188
+ tt.return %4, %5 : f32, i32 loc(#loc52)
189
+ } loc(#loc33)
190
+ tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} {
191
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54)
192
+ %true = arith.constant true loc(#loc55)
193
+ tt.return %true : i1 loc(#loc55)
194
+ ^bb1: // no predecessors
195
+ %1 = ub.poison : i1 loc(#loc56)
196
+ tt.return %1 : i1 loc(#loc56)
197
+ } loc(#loc53)
198
+ tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} {
199
+ %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58)
200
+ %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59)
201
+ %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59)
202
+ %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59)
203
+ tt.return %3 : tensor<1xf32> loc(#loc60)
204
+ ^bb1: // no predecessors
205
+ %4 = ub.poison : tensor<1xf32> loc(#loc61)
206
+ tt.return %4 : tensor<1xf32> loc(#loc61)
207
+ } loc(#loc57)
208
+ } loc(#loc)
209
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":19:13)
210
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":20:15)
211
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28)
212
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:33)
213
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:36)
214
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:44)
215
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:23)
216
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":25:46)
217
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:27)
218
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37)
219
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19)
220
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19)
221
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":30:55)
222
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":31:58)
223
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40)
224
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31)
225
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29)
226
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47)
227
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41)
228
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61)
229
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52)
230
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34)
231
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66)
232
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38)
233
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46)
234
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58)
235
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8)
236
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75)
237
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20)
238
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25)
239
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36)
240
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4)
241
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
242
+ #loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
243
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19)
244
+ #loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7)
245
+ #loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
246
+ #loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
247
+ #loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
248
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
249
+ #loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
250
+ #loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
251
+ #loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
252
+ #loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
253
+ #loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
254
+ #loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
255
+ #loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
256
+ #loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
257
+ #loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11)
258
+ #loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4)
259
+ #loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29)
260
+ #loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11)
261
+ #loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4)
262
+ #loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30)
263
+ #loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15)
264
+ #loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11)
265
+ #loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4)
266
+ #loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0)
267
+ #loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
268
+ #loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11)
269
+ #loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4)
270
+ #loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
271
+ #loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11)
272
+ #loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4)
273
+ #loc74 = loc("xnumel"(#loc1))
274
+ #loc75 = loc("r0_numel"(#loc2))
275
+ #loc76 = loc("xoffset"(#loc3))
276
+ #loc77 = loc("xoffset"(#loc4))
277
+ #loc78 = loc("xindex"(#loc5))
278
+ #loc79 = loc("xindex"(#loc6))
279
+ #loc80 = loc("xindex"(#loc7))
280
+ #loc81 = loc("xmask"(#loc8))
281
+ #loc82 = loc("r0_base"(#loc9))
282
+ #loc83 = loc("r0_base"(#loc10))
283
+ #loc84 = loc("x0"(#loc11))
284
+ #loc85 = loc("x1"(#loc12))
285
+ #loc86 = loc("_tmp2"(#loc13))
286
+ #loc87 = loc("_tmp2_index"(#loc14))
287
+ #loc88 = loc("_tmp2"(#loc15))
288
+ #loc89 = loc("r0_index"(#loc16))
289
+ #loc90 = loc("r0_mask"(#loc17))
290
+ #loc91 = loc("tmp0"(#loc18))
291
+ #loc92 = loc("tmp0"(#loc19))
292
+ #loc93 = loc("tmp0"(#loc20))
293
+ #loc94 = loc("tmp0"(#loc21))
294
+ #loc95 = loc("tmp0"(#loc22))
295
+ #loc96 = loc("tmp0"(#loc23))
296
+ #loc97 = loc("_tmp2"(#loc25))
297
+ #loc98 = loc("_tmp2_index"(#loc26))
298
+ #loc99 = loc("tmp2"(#loc29))
299
+ #loc104 = loc("mask"(#loc34))
300
+ #loc105 = loc("equal"(#loc35))
301
+ #loc106 = loc("a_isnan"(#loc38))
302
+ #loc107 = loc("b_isnan"(#loc39))
303
+ #loc108 = loc("mask"(#loc40))
304
+ #loc109 = loc("mask"(#loc41))
305
+ #loc110 = loc("mask"(#loc42))
306
+ #loc111 = loc("equal"(#loc43))
307
+ #loc112 = loc("equal"(#loc44))
308
+ #loc113 = loc("mask"(#loc46))
309
+ #loc114 = loc("mask"(#loc47))
310
+ #loc115 = loc("mask"(#loc48))
311
+ #loc120 = loc("_tmp2_index"(#loc88))
312
+ #loc121 = loc("mask"(#loc104))
313
+ #loc122 = loc("equal"(#loc105))
314
+ #loc123 = loc("mask"(#loc110))
315
+ #loc124 = loc("equal"(#loc112))
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 4], order = [1, 0]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 16], order = [0, 1]}>
3
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0)
4
+ #loc1 = loc(unknown)
5
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75)
6
+ #loc42 = loc("in_ptr0"(#loc))
7
+ #loc43 = loc("out_ptr0"(#loc))
8
+ #loc44 = loc("xnumel"(#loc))
9
+ #loc45 = loc("r0_numel"(#loc))
10
+ #loc79 = loc(callsite(#loc1 at #loc37))
11
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
12
+ tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
13
+ %cst = arith.constant dense<65760000> : tensor<8x1xi32, #blocked> loc(#loc1)
14
+ %cst_0 = arith.constant dense<32000> : tensor<8x1xi32, #blocked> loc(#loc1)
15
+ %cst_1 = arith.constant dense<32000> : tensor<1x512xi32, #blocked> loc(#loc1)
16
+ %cst_2 = arith.constant dense<2048> : tensor<8x1xi32, #blocked> loc(#loc1)
17
+ %c8_i32 = arith.constant 8 : i32 loc(#loc1)
18
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x512xf32, #blocked> loc(#loc1)
19
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
20
+ %c32000_i32 = arith.constant 32000 : i32 loc(#loc1)
21
+ %c512_i32 = arith.constant 512 : i32 loc(#loc1)
22
+ %true = arith.constant true loc(#loc1)
23
+ %cst_4 = arith.constant dense<true> : tensor<8x512xi1, #blocked> loc(#loc1)
24
+ %cst_5 = arith.constant dense<2147483647> : tensor<8x512xi32, #blocked> loc(#loc1)
25
+ %cst_6 = arith.constant dense<0xFF800000> : tensor<8x512xf32, #blocked> loc(#loc1)
26
+ %xoffset = tt.get_program_id x : i32 loc(#loc46)
27
+ %xoffset_7 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc47)
28
+ %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48)
29
+ %xindex_8 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48)
30
+ %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc48)
31
+ %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc48)
32
+ %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked> loc(#loc49)
33
+ %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc49)
34
+ %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<8x1xi32, #blocked> loc(#loc49)
35
+ %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked1> loc(#loc49)
36
+ %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc50)
37
+ %r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x512xi32, #blocked> loc(#loc50)
38
+ %x0 = arith.remsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc51)
39
+ %x1 = arith.divsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc52)
40
+ %tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc53)
41
+ %tmp0_16 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc54)
42
+ %tmp0_17 = arith.muli %x1, %cst : tensor<8x1xi32, #blocked> loc(#loc55)
43
+ %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc56)
44
+ %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<8x512x!tt.ptr<f32>, #blocked> loc(#loc57)
45
+ %_tmp2_index:2 = scf.for %_tmp2_index_20 = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2 = %cst_6, %_tmp2_index_21 = %cst_5) -> (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) : i32 {
46
+ %r0_index = tt.splat %_tmp2_index_20 : i32 -> tensor<1x512xi32, #blocked> loc(#loc59)
47
+ %r0_index_22 = arith.addi %r0_index, %r0_base_15 : tensor<1x512xi32, #blocked> loc(#loc59)
48
+ %r0_mask = arith.cmpi slt, %r0_index_22, %cst_1 : tensor<1x512xi32, #blocked> loc(#loc60)
49
+ %tmp0_23 = tt.broadcast %r0_index_22 : tensor<1x512xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc54)
50
+ %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<8x512xi32, #blocked> loc(#loc54)
51
+ %tmp0_25 = arith.addi %tmp0_24, %tmp0_18 : tensor<8x512xi32, #blocked> loc(#loc56)
52
+ %tmp0_26 = tt.addptr %tmp0_19, %tmp0_25 : tensor<8x512x!tt.ptr<f32>, #blocked>, tensor<8x512xi32, #blocked> loc(#loc57)
53
+ %tmp0_27 = tt.broadcast %r0_mask : tensor<1x512xi1, #blocked> -> tensor<8x512xi1, #blocked> loc(#loc61)
54
+ %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_3 evictionPolicy = evict_first : tensor<8x512x!tt.ptr<f32>, #blocked> loc(#loc61)
55
+ %mask = arith.cmpf ogt, %_tmp2, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc104)
56
+ %equal = arith.cmpf oeq, %_tmp2, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc105)
57
+ %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<8x512xf32, #blocked> loc(#loc84)
58
+ %b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc85)
59
+ %mask_29 = arith.xori %b_isnan, %cst_4 : tensor<8x512xi1, #blocked> loc(#loc86)
60
+ %mask_30 = arith.andi %a_isnan, %mask_29 : tensor<8x512xi1, #blocked> loc(#loc87)
61
+ %mask_31 = arith.ori %mask, %mask_30 : tensor<8x512xi1, #blocked> loc(#loc106)
62
+ %equal_32 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1, #blocked> loc(#loc89)
63
+ %equal_33 = arith.ori %equal, %equal_32 : tensor<8x512xi1, #blocked> loc(#loc107)
64
+ %mask_34 = arith.cmpi slt, %_tmp2_index_21, %tmp0_23 : tensor<8x512xi32, #blocked> loc(#loc91)
65
+ %mask_35 = arith.andi %equal_33, %mask_34 : tensor<8x512xi1, #blocked> loc(#loc92)
66
+ %mask_36 = arith.ori %mask_31, %mask_35 : tensor<8x512xi1, #blocked> loc(#loc93)
67
+ %5 = arith.select %mask_36, %_tmp2, %tmp0_28 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc74)
68
+ %6 = arith.select %mask_36, %_tmp2_index_21, %tmp0_23 : tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc75)
69
+ %_tmp2_37 = arith.select %tmp0_27, %5, %_tmp2 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc76)
70
+ %_tmp2_index_38 = arith.select %tmp0_27, %6, %_tmp2_index_21 : tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc77)
71
+ scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked> loc(#loc35)
72
+ } loc(#loc81)
73
+ %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({
74
+ ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc37)), %arg5: i32 loc(callsite(#loc1 at #loc37)), %arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37))):
75
+ %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108)
76
+ %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109)
77
+ %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94)
78
+ %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95)
79
+ %mask_20 = arith.xori %b_isnan, %true : i1 loc(#loc96)
80
+ %mask_21 = arith.andi %a_isnan, %mask_20 : i1 loc(#loc97)
81
+ %mask_22 = arith.ori %mask, %mask_21 : i1 loc(#loc110)
82
+ %equal_23 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98)
83
+ %equal_24 = arith.ori %equal, %equal_23 : i1 loc(#loc111)
84
+ %mask_25 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99)
85
+ %mask_26 = arith.andi %equal_24, %mask_25 : i1 loc(#loc100)
86
+ %mask_27 = arith.ori %mask_22, %mask_26 : i1 loc(#loc101)
87
+ %5 = arith.select %mask_27, %arg4, %arg6 : f32 loc(#loc102)
88
+ %6 = arith.select %mask_27, %arg5, %arg7 : i32 loc(#loc103)
89
+ tt.reduce.return %5, %6 : f32, i32 loc(#loc78)
90
+ }) : (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) -> (tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78)
91
+ %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc80)
92
+ %1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<8x1x!tt.ptr<i64>, #blocked1> loc(#loc39)
93
+ %2 = tt.addptr %1, %xindex_14 : tensor<8x1x!tt.ptr<i64>, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc39)
94
+ %3 = ttg.convert_layout %tmp2 : tensor<8x1xi32, #blocked> -> tensor<8x1xi32, #blocked1> loc(#loc40)
95
+ %4 = arith.extsi %3 : tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc40)
96
+ tt.store %2, %4 : tensor<8x1x!tt.ptr<i64>, #blocked1> loc(#loc40)
97
+ tt.return loc(#loc41)
98
+ } loc(#loc)
99
+ } loc(#loc)
100
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28)
101
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:33)
102
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:44)
103
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:23)
104
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37)
105
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19)
106
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19)
107
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47)
108
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41)
109
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61)
110
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52)
111
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34)
112
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40)
113
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31)
114
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29)
115
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66)
116
+ #loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
117
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38)
118
+ #loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
119
+ #loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
120
+ #loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
121
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
122
+ #loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
123
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
124
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
125
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
126
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
127
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
128
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
129
+ #loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
130
+ #loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
131
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46)
132
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58)
133
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8)
134
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
135
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20)
136
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25)
137
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36)
138
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4)
139
+ #loc46 = loc("xoffset"(#loc2))
140
+ #loc47 = loc("xoffset"(#loc3))
141
+ #loc48 = loc("xindex"(#loc4))
142
+ #loc49 = loc("xindex"(#loc5))
143
+ #loc50 = loc("r0_base"(#loc6))
144
+ #loc51 = loc("x0"(#loc7))
145
+ #loc52 = loc("x1"(#loc8))
146
+ #loc53 = loc("tmp0"(#loc9))
147
+ #loc54 = loc("tmp0"(#loc10))
148
+ #loc55 = loc("tmp0"(#loc11))
149
+ #loc56 = loc("tmp0"(#loc12))
150
+ #loc57 = loc("tmp0"(#loc13))
151
+ #loc58 = loc("_tmp2"(#loc14))
152
+ #loc59 = loc("r0_index"(#loc15))
153
+ #loc60 = loc("r0_mask"(#loc16))
154
+ #loc61 = loc("tmp0"(#loc17))
155
+ #loc62 = loc("mask"(#loc18))
156
+ #loc63 = loc("equal"(#loc20))
157
+ #loc64 = loc("a_isnan"(#loc21))
158
+ #loc65 = loc("b_isnan"(#loc22))
159
+ #loc66 = loc("mask"(#loc23))
160
+ #loc67 = loc("mask"(#loc24))
161
+ #loc68 = loc("mask"(#loc25))
162
+ #loc69 = loc("equal"(#loc26))
163
+ #loc70 = loc("equal"(#loc27))
164
+ #loc71 = loc("mask"(#loc28))
165
+ #loc72 = loc("mask"(#loc29))
166
+ #loc73 = loc("mask"(#loc30))
167
+ #loc74 = loc(callsite(#loc31 at #loc19))
168
+ #loc75 = loc(callsite(#loc32 at #loc19))
169
+ #loc76 = loc("_tmp2"(#loc33))
170
+ #loc77 = loc("_tmp2_index"(#loc34))
171
+ #loc78 = loc(callsite(#loc36 at #loc37))
172
+ #loc80 = loc("tmp2"(#loc38))
173
+ #loc81 = loc("_tmp2_index"(#loc58))
174
+ #loc82 = loc("mask"(#loc62))
175
+ #loc83 = loc("equal"(#loc63))
176
+ #loc84 = loc(callsite(#loc64 at #loc19))
177
+ #loc85 = loc(callsite(#loc65 at #loc19))
178
+ #loc86 = loc(callsite(#loc66 at #loc19))
179
+ #loc87 = loc(callsite(#loc67 at #loc19))
180
+ #loc88 = loc("mask"(#loc68))
181
+ #loc89 = loc(callsite(#loc69 at #loc19))
182
+ #loc90 = loc("equal"(#loc70))
183
+ #loc91 = loc(callsite(#loc71 at #loc19))
184
+ #loc92 = loc(callsite(#loc72 at #loc19))
185
+ #loc93 = loc(callsite(#loc73 at #loc19))
186
+ #loc94 = loc(callsite(#loc64 at #loc78))
187
+ #loc95 = loc(callsite(#loc65 at #loc78))
188
+ #loc96 = loc(callsite(#loc66 at #loc78))
189
+ #loc97 = loc(callsite(#loc67 at #loc78))
190
+ #loc98 = loc(callsite(#loc69 at #loc78))
191
+ #loc99 = loc(callsite(#loc71 at #loc78))
192
+ #loc100 = loc(callsite(#loc72 at #loc78))
193
+ #loc101 = loc(callsite(#loc73 at #loc78))
194
+ #loc102 = loc(callsite(#loc31 at #loc78))
195
+ #loc103 = loc(callsite(#loc32 at #loc78))
196
+ #loc104 = loc(callsite(#loc82 at #loc19))
197
+ #loc105 = loc(callsite(#loc83 at #loc19))
198
+ #loc106 = loc(callsite(#loc88 at #loc19))
199
+ #loc107 = loc(callsite(#loc90 at #loc19))
200
+ #loc108 = loc(callsite(#loc82 at #loc78))
201
+ #loc109 = loc(callsite(#loc83 at #loc78))
202
+ #loc110 = loc(callsite(#loc88 at #loc78))
203
+ #loc111 = loc(callsite(#loc90 at #loc78))
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75)
4
+ #loc46 = loc("in_ptr0"(#loc))
5
+ #loc47 = loc("out_ptr0"(#loc))
6
+ #loc48 = loc("xnumel"(#loc))
7
+ #loc49 = loc("r0_numel"(#loc))
8
+ #loc50 = loc(callsite(#loc1 at #loc2))
9
+ module {
10
+ tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
11
+ %true = arith.constant true loc(#loc50)
12
+ %cst = arith.constant dense<true> : tensor<8x512xi1> loc(#loc1)
13
+ %c512_i32 = arith.constant 512 : i32 loc(#loc3)
14
+ %c32000_i32 = arith.constant 32000 : i32 loc(#loc3)
15
+ %c0_i32 = arith.constant 0 : i32 loc(#loc3)
16
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc1)
17
+ %cst_1 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc1)
18
+ %cst_2 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc1)
19
+ %cst_3 = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc1)
20
+ %_tmp2_index = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc51)
21
+ %_tmp2 = arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc52)
22
+ %cst_4 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc1)
23
+ %c8_i32 = arith.constant 8 : i32 loc(#loc1)
24
+ %xoffset = tt.get_program_id x : i32 loc(#loc53)
25
+ %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc54)
26
+ %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc55)
27
+ %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc56)
28
+ %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32> loc(#loc57)
29
+ %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<8x1xi32> loc(#loc57)
30
+ %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc58)
31
+ %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc59)
32
+ %x0 = arith.remsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc60)
33
+ %x1 = arith.divsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc61)
34
+ %_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 {
35
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc63)
36
+ %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x512xi32> loc(#loc63)
37
+ %r0_mask = arith.cmpi slt, %r0_index_13, %cst_3 : tensor<1x512xi32> loc(#loc64)
38
+ %tmp0 = arith.muli %x0, %cst_2 : tensor<8x1xi32> loc(#loc65)
39
+ %tmp0_14 = tt.broadcast %r0_index_13 : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc66)
40
+ %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc66)
41
+ %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x512xi32> loc(#loc66)
42
+ %tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32> loc(#loc67)
43
+ %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc68)
44
+ %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x512xi32> loc(#loc68)
45
+ %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<8x512x!tt.ptr<f32>> loc(#loc69)
46
+ %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x512x!tt.ptr<f32>>, tensor<8x512xi32> loc(#loc69)
47
+ %tmp0_22 = tt.broadcast %r0_mask : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc70)
48
+ %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_0 evictionPolicy = evict_first : tensor<8x512x!tt.ptr<f32>> loc(#loc70)
49
+ %mask = arith.cmpf ogt, %_tmp2_11, %tmp0_23 : tensor<8x512xf32> loc(#loc112)
50
+ %equal = arith.cmpf oeq, %_tmp2_11, %tmp0_23 : tensor<8x512xf32> loc(#loc113)
51
+ %a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<8x512xf32> loc(#loc92)
52
+ %b_isnan = arith.cmpf une, %tmp0_23, %tmp0_23 : tensor<8x512xf32> loc(#loc93)
53
+ %mask_24 = arith.xori %b_isnan, %cst : tensor<8x512xi1> loc(#loc94)
54
+ %mask_25 = arith.andi %a_isnan, %mask_24 : tensor<8x512xi1> loc(#loc95)
55
+ %mask_26 = arith.ori %mask, %mask_25 : tensor<8x512xi1> loc(#loc114)
56
+ %equal_27 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc97)
57
+ %equal_28 = arith.ori %equal, %equal_27 : tensor<8x512xi1> loc(#loc115)
58
+ %mask_29 = arith.cmpi slt, %_tmp2_index_12, %tmp0_14 : tensor<8x512xi32> loc(#loc99)
59
+ %mask_30 = arith.andi %equal_28, %mask_29 : tensor<8x512xi1> loc(#loc100)
60
+ %mask_31 = arith.ori %mask_26, %mask_30 : tensor<8x512xi1> loc(#loc101)
61
+ %4 = arith.select %mask_31, %_tmp2_11, %tmp0_23 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc83)
62
+ %5 = arith.select %mask_31, %_tmp2_index_12, %tmp0_14 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc84)
63
+ %_tmp2_32 = arith.select %tmp0_22, %4, %_tmp2_11 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc85)
64
+ %_tmp2_index_33 = arith.select %tmp0_22, %5, %_tmp2_index_12 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc86)
65
+ scf.yield %_tmp2_32, %_tmp2_index_33 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc40)
66
+ } loc(#loc89)
67
+ %0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({
68
+ ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))):
69
+ %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc116)
70
+ %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc117)
71
+ %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc102)
72
+ %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103)
73
+ %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc104)
74
+ %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc105)
75
+ %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc118)
76
+ %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc106)
77
+ %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc119)
78
+ %mask_16 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc107)
79
+ %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc108)
80
+ %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc109)
81
+ %4 = arith.select %mask_18, %arg4, %arg6 : f32 loc(#loc110)
82
+ %5 = arith.select %mask_18, %arg5, %arg7 : i32 loc(#loc111)
83
+ tt.reduce.return %4, %5 : f32, i32 loc(#loc87)
84
+ }) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc87)
85
+ %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc88)
86
+ %1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<8x1x!tt.ptr<i64>> loc(#loc43)
87
+ %2 = tt.addptr %1, %xindex_8 : tensor<8x1x!tt.ptr<i64>>, tensor<8x1xi32> loc(#loc43)
88
+ %3 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc44)
89
+ tt.store %2, %3 : tensor<8x1x!tt.ptr<i64>> loc(#loc44)
90
+ tt.return loc(#loc45)
91
+ } loc(#loc)
92
+ } loc(#loc)
93
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40)
94
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":31:58)
95
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":30:55)
96
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28)
97
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:33)
98
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:36)
99
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:44)
100
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:23)
101
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:27)
102
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37)
103
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19)
104
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19)
105
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31)
106
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29)
107
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47)
108
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41)
109
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61)
110
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52)
111
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34)
112
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66)
113
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
114
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38)
115
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
116
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
117
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
118
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
119
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
120
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
121
+ #loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
122
+ #loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
123
+ #loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
124
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
125
+ #loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
126
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
127
+ #loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
128
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46)
129
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58)
130
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8)
131
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
132
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20)
133
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25)
134
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36)
135
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4)
136
+ #loc51 = loc("_tmp2_index"(#loc4))
137
+ #loc52 = loc("_tmp2"(#loc5))
138
+ #loc53 = loc("xoffset"(#loc6))
139
+ #loc54 = loc("xoffset"(#loc7))
140
+ #loc55 = loc("xindex"(#loc8))
141
+ #loc56 = loc("xindex"(#loc9))
142
+ #loc57 = loc("xindex"(#loc10))
143
+ #loc58 = loc("r0_base"(#loc11))
144
+ #loc59 = loc("r0_base"(#loc12))
145
+ #loc60 = loc("x0"(#loc13))
146
+ #loc61 = loc("x1"(#loc14))
147
+ #loc62 = loc("_tmp2"(#loc3))
148
+ #loc63 = loc("r0_index"(#loc15))
149
+ #loc64 = loc("r0_mask"(#loc16))
150
+ #loc65 = loc("tmp0"(#loc17))
151
+ #loc66 = loc("tmp0"(#loc18))
152
+ #loc67 = loc("tmp0"(#loc19))
153
+ #loc68 = loc("tmp0"(#loc20))
154
+ #loc69 = loc("tmp0"(#loc21))
155
+ #loc70 = loc("tmp0"(#loc22))
156
+ #loc71 = loc("mask"(#loc23))
157
+ #loc72 = loc("equal"(#loc25))
158
+ #loc73 = loc("a_isnan"(#loc26))
159
+ #loc74 = loc("b_isnan"(#loc27))
160
+ #loc75 = loc("mask"(#loc28))
161
+ #loc76 = loc("mask"(#loc29))
162
+ #loc77 = loc("mask"(#loc30))
163
+ #loc78 = loc("equal"(#loc31))
164
+ #loc79 = loc("equal"(#loc32))
165
+ #loc80 = loc("mask"(#loc33))
166
+ #loc81 = loc("mask"(#loc34))
167
+ #loc82 = loc("mask"(#loc35))
168
+ #loc83 = loc(callsite(#loc36 at #loc24))
169
+ #loc84 = loc(callsite(#loc37 at #loc24))
170
+ #loc85 = loc("_tmp2"(#loc38))
171
+ #loc86 = loc("_tmp2_index"(#loc39))
172
+ #loc87 = loc(callsite(#loc41 at #loc2))
173
+ #loc88 = loc("tmp2"(#loc42))
174
+ #loc89 = loc("_tmp2_index"(#loc62))
175
+ #loc90 = loc("mask"(#loc71))
176
+ #loc91 = loc("equal"(#loc72))
177
+ #loc92 = loc(callsite(#loc73 at #loc24))
178
+ #loc93 = loc(callsite(#loc74 at #loc24))
179
+ #loc94 = loc(callsite(#loc75 at #loc24))
180
+ #loc95 = loc(callsite(#loc76 at #loc24))
181
+ #loc96 = loc("mask"(#loc77))
182
+ #loc97 = loc(callsite(#loc78 at #loc24))
183
+ #loc98 = loc("equal"(#loc79))
184
+ #loc99 = loc(callsite(#loc80 at #loc24))
185
+ #loc100 = loc(callsite(#loc81 at #loc24))
186
+ #loc101 = loc(callsite(#loc82 at #loc24))
187
+ #loc102 = loc(callsite(#loc73 at #loc87))
188
+ #loc103 = loc(callsite(#loc74 at #loc87))
189
+ #loc104 = loc(callsite(#loc75 at #loc87))
190
+ #loc105 = loc(callsite(#loc76 at #loc87))
191
+ #loc106 = loc(callsite(#loc78 at #loc87))
192
+ #loc107 = loc(callsite(#loc80 at #loc87))
193
+ #loc108 = loc(callsite(#loc81 at #loc87))
194
+ #loc109 = loc(callsite(#loc82 at #loc87))
195
+ #loc110 = loc(callsite(#loc36 at #loc87))
196
+ #loc111 = loc(callsite(#loc37 at #loc87))
197
+ #loc112 = loc(callsite(#loc90 at #loc24))
198
+ #loc113 = loc(callsite(#loc91 at #loc24))
199
+ #loc114 = loc(callsite(#loc96 at #loc24))
200
+ #loc115 = loc(callsite(#loc98 at #loc24))
201
+ #loc116 = loc(callsite(#loc90 at #loc87))
202
+ #loc117 = loc(callsite(#loc91 at #loc87))
203
+ #loc118 = loc(callsite(#loc96 at #loc87))
204
+ #loc119 = loc(callsite(#loc98 at #loc87))
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/__grp__triton_poi_fused_new_zeros_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_poi_fused_new_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source", "triton_poi_fused_new_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir", "triton_poi_fused_new_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir", "triton_poi_fused_new_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir", "triton_poi_fused_new_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx", "triton_poi_fused_new_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin", "triton_poi_fused_new_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json"}}
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin ADDED
Binary file (5.77 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "e69b538dd589c5b9c913d33cdaa69fd2fa4ad6662ded2e7cc313f4666af16fd4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_0"}
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ ; Function Attrs: nounwind
6
+ define ptx_kernel void @triton_poi_fused_new_zeros_0(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 {
7
+ %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
8
+ %6 = shl i32 %5, 8, !dbg !8
9
+ %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
10
+ %8 = shl nuw nsw i32 %7, 1, !dbg !9
11
+ %9 = and i32 %8, 254, !dbg !9
12
+ %10 = or disjoint i32 %9, %6, !dbg !10
13
+ %11 = or disjoint i32 %10, 1, !dbg !10
14
+ %12 = icmp slt i32 %10, %1, !dbg !11
15
+ %13 = icmp slt i32 %11, %1, !dbg !11
16
+ %14 = sext i32 %10 to i64, !dbg !12
17
+ %15 = getelementptr i32, ptr addrspace(1) %0, i64 %14, !dbg !12
18
+ %16 = sext i32 %11 to i64, !dbg !12
19
+ %17 = getelementptr i32, ptr addrspace(1) %0, i64 %16, !dbg !12
20
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %15, i1 %12) #2, !dbg !13
21
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %17, i1 %13) #2, !dbg !13
22
+ ret void, !dbg !14
23
+ }
24
+
25
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
26
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
27
+
28
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
29
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
30
+
31
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
32
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
33
+ attributes #2 = { nounwind }
34
+
35
+ !llvm.dbg.cu = !{!0}
36
+ !llvm.module.flags = !{!2, !3}
37
+
38
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
39
+ !1 = !DIFile(filename: "cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6")
40
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
41
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
42
+ !4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_0", linkageName: "triton_poi_fused_new_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
43
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
44
+ !6 = !{}
45
+ !7 = !DILocation(line: 19, column: 28, scope: !4)
46
+ !8 = !DILocation(line: 19, column: 33, scope: !4)
47
+ !9 = !DILocation(line: 20, column: 36, scope: !4)
48
+ !10 = !DILocation(line: 20, column: 23, scope: !4)
49
+ !11 = !DILocation(line: 21, column: 21, scope: !4)
50
+ !12 = !DILocation(line: 24, column: 25, scope: !4)
51
+ !13 = !DILocation(line: 24, column: 36, scope: !4)
52
+ !14 = !DILocation(line: 24, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_poi_fused_new_zeros_0 // -- Begin function triton_poi_fused_new_zeros_0
10
+ // @triton_poi_fused_new_zeros_0
11
+ .visible .entry triton_poi_fused_new_zeros_0(
12
+ .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_0,
13
+ .param .u32 triton_poi_fused_new_zeros_0_param_1,
14
+ .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_2,
15
+ .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_3
16
+ )
17
+ .reqntid 128
18
+ {
19
+ .reg .pred %p<3>;
20
+ .reg .b32 %r<11>;
21
+ .reg .b64 %rd<4>;
22
+ .loc 1 18 0 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:18:0
23
+ $L__func_begin0:
24
+ .loc 1 18 0 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:18:0
25
+
26
+ // %bb.0:
27
+ ld.param.b64 %rd3, [triton_poi_fused_new_zeros_0_param_0];
28
+ ld.param.b32 %r3, [triton_poi_fused_new_zeros_0_param_1];
29
+ $L__tmp0:
30
+ .loc 1 19 28 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:19:28
31
+ mov.u32 %r4, %ctaid.x;
32
+ .loc 1 19 33 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:19:33
33
+ shl.b32 %r5, %r4, 8;
34
+ .loc 1 20 36 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:20:36
35
+ mov.u32 %r6, %tid.x;
36
+ shl.b32 %r7, %r6, 1;
37
+ and.b32 %r8, %r7, 254;
38
+ .loc 1 20 23 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:20:23
39
+ or.b32 %r9, %r8, %r5;
40
+ or.b32 %r10, %r9, 1;
41
+ .loc 1 21 21 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:21:21
42
+ setp.lt.s32 %p1, %r9, %r3;
43
+ setp.lt.s32 %p2, %r10, %r3;
44
+ .loc 1 24 25 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:24:25
45
+ mad.wide.s32 %rd1, %r9, 4, %rd3;
46
+ add.s64 %rd2, %rd1, 4;
47
+ mov.b32 %r1, 0;
48
+ .loc 1 24 36 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:24:36
49
+ // begin inline asm
50
+ @%p1 st.global.b32 [ %rd1 + 0 ], { %r1 };
51
+ // end inline asm
52
+ // begin inline asm
53
+ @%p2 st.global.b32 [ %rd2 + 0 ], { %r1 };
54
+ // end inline asm
55
+ .loc 1 24 4 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:24:4
56
+ ret;
57
+ $L__tmp1:
58
+ $L__func_end0:
59
+ // -- End function
60
+ }
61
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py"
62
+ .section .debug_abbrev
63
+ {
64
+ .b8 1 // Abbreviation Code
65
+ .b8 17 // DW_TAG_compile_unit
66
+ .b8 0 // DW_CHILDREN_no
67
+ .b8 37 // DW_AT_producer
68
+ .b8 8 // DW_FORM_string
69
+ .b8 19 // DW_AT_language
70
+ .b8 5 // DW_FORM_data2
71
+ .b8 3 // DW_AT_name
72
+ .b8 8 // DW_FORM_string
73
+ .b8 16 // DW_AT_stmt_list
74
+ .b8 6 // DW_FORM_data4
75
+ .b8 27 // DW_AT_comp_dir
76
+ .b8 8 // DW_FORM_string
77
+ .b8 0 // EOM(1)
78
+ .b8 0 // EOM(2)
79
+ .b8 0 // EOM(3)
80
+ }
81
+ .section .debug_info
82
+ {
83
+ .b32 135 // Length of Unit
84
+ .b8 2 // DWARF version number
85
+ .b8 0
86
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
87
+ .b8 8 // Address Size (in bytes)
88
+ .b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit
89
+ .b8 116 // DW_AT_producer
90
+ .b8 114
91
+ .b8 105
92
+ .b8 116
93
+ .b8 111
94
+ .b8 110
95
+ .b8 0
96
+ .b8 2 // DW_AT_language
97
+ .b8 0
98
+ .b8 99 // DW_AT_name
99
+ .b8 106
100
+ .b8 54
101
+ .b8 108
102
+ .b8 98
103
+ .b8 50
104
+ .b8 108
105
+ .b8 119
106
+ .b8 97
107
+ .b8 98
108
+ .b8 52
109
+ .b8 51
110
+ .b8 122
111
+ .b8 118
112
+ .b8 101
113
+ .b8 108
114
+ .b8 51
115
+ .b8 52
116
+ .b8 122
117
+ .b8 51
118
+ .b8 119
119
+ .b8 115
120
+ .b8 100
121
+ .b8 122
122
+ .b8 103
123
+ .b8 106
124
+ .b8 110
125
+ .b8 115
126
+ .b8 55
127
+ .b8 101
128
+ .b8 102
129
+ .b8 119
130
+ .b8 121
131
+ .b8 118
132
+ .b8 106
133
+ .b8 100
134
+ .b8 50
135
+ .b8 121
136
+ .b8 99
137
+ .b8 101
138
+ .b8 120
139
+ .b8 113
140
+ .b8 106
141
+ .b8 51
142
+ .b8 98
143
+ .b8 110
144
+ .b8 97
145
+ .b8 121
146
+ .b8 105
147
+ .b8 118
148
+ .b8 104
149
+ .b8 54
150
+ .b8 46
151
+ .b8 112
152
+ .b8 121
153
+ .b8 0
154
+ .b32 .debug_line // DW_AT_stmt_list
155
+ .b8 47 // DW_AT_comp_dir
156
+ .b8 119
157
+ .b8 111
158
+ .b8 114
159
+ .b8 107
160
+ .b8 115
161
+ .b8 112
162
+ .b8 97
163
+ .b8 99
164
+ .b8 101
165
+ .b8 47
166
+ .b8 104
167
+ .b8 97
168
+ .b8 110
169
+ .b8 114
170
+ .b8 117
171
+ .b8 105
172
+ .b8 47
173
+ .b8 83
174
+ .b8 112
175
+ .b8 101
176
+ .b8 99
177
+ .b8 70
178
+ .b8 111
179
+ .b8 114
180
+ .b8 103
181
+ .b8 101
182
+ .b8 45
183
+ .b8 101
184
+ .b8 120
185
+ .b8 116
186
+ .b8 47
187
+ .b8 99
188
+ .b8 97
189
+ .b8 99
190
+ .b8 104
191
+ .b8 101
192
+ .b8 47
193
+ .b8 99
194
+ .b8 111
195
+ .b8 109
196
+ .b8 112
197
+ .b8 105
198
+ .b8 108
199
+ .b8 101
200
+ .b8 100
201
+ .b8 95
202
+ .b8 107
203
+ .b8 101
204
+ .b8 114
205
+ .b8 110
206
+ .b8 101
207
+ .b8 108
208
+ .b8 115
209
+ .b8 47
210
+ .b8 106
211
+ .b8 54
212
+ .b8 0
213
+ }
214
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":18:0)
2
+ #loc10 = loc("out_ptr0"(#loc))
3
+ #loc11 = loc("xnumel"(#loc))
4
+ module {
5
+ tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
6
+ %xoffset = tt.get_program_id x : i32 loc(#loc12)
7
+ %xoffset_0 = arith.constant 256 : i32 loc(#loc13)
8
+ %xoffset_1 = arith.constant 256 : i32 loc(#loc13)
9
+ %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc13)
10
+ %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14)
11
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc15)
12
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc15)
13
+ %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16)
14
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc16)
15
+ %tmp0 = arith.constant 0 : i32 loc(#loc17)
16
+ %tmp0_6 = arith.constant dense<0> : tensor<1xi32> loc(#loc17)
17
+ %0 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>> loc(#loc7)
18
+ %1 = tt.addptr %0, %xindex_4 : tensor<256x!tt.ptr<i32>>, tensor<256xi32> loc(#loc7)
19
+ %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc8)
20
+ tt.store %1, %cst, %xmask_5 : tensor<256x!tt.ptr<i32>> loc(#loc8)
21
+ tt.return loc(#loc9)
22
+ } loc(#loc)
23
+ } loc(#loc)
24
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:28)
25
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:33)
26
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:36)
27
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:23)
28
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":21:21)
29
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":23:27)
30
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:25)
31
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:36)
32
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:4)
33
+ #loc12 = loc("xoffset"(#loc1))
34
+ #loc13 = loc("xoffset"(#loc2))
35
+ #loc14 = loc("xindex"(#loc3))
36
+ #loc15 = loc("xindex"(#loc4))
37
+ #loc16 = loc("xmask"(#loc5))
38
+ #loc17 = loc("tmp0"(#loc6))
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
2
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":18:0)
3
+ #loc10 = loc("out_ptr0"(#loc))
4
+ #loc11 = loc("xnumel"(#loc))
5
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
6
+ tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
7
+ %c256_i32 = arith.constant 256 : i32 loc(#loc1)
8
+ %cst = arith.constant dense<0> : tensor<256xi32, #blocked> loc(#loc1)
9
+ %xoffset = tt.get_program_id x : i32 loc(#loc12)
10
+ %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13)
11
+ %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc14)
12
+ %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32, #blocked> loc(#loc15)
13
+ %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32, #blocked> loc(#loc15)
14
+ %xmask = tt.splat %xnumel : i32 -> tensor<256xi32, #blocked> loc(#loc16)
15
+ %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32, #blocked> loc(#loc16)
16
+ %0 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>, #blocked> loc(#loc7)
17
+ %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr<i32>, #blocked>, tensor<256xi32, #blocked> loc(#loc7)
18
+ tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr<i32>, #blocked> loc(#loc8)
19
+ tt.return loc(#loc9)
20
+ } loc(#loc)
21
+ } loc(#loc)
22
+ #loc1 = loc(unknown)
23
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:28)
24
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:33)
25
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:36)
26
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:23)
27
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":21:21)
28
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:25)
29
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:36)
30
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:4)
31
+ #loc12 = loc("xoffset"(#loc2))
32
+ #loc13 = loc("xoffset"(#loc3))
33
+ #loc14 = loc("xindex"(#loc4))
34
+ #loc15 = loc("xindex"(#loc5))
35
+ #loc16 = loc("xmask"(#loc6))
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":18:0)
2
+ #loc10 = loc("out_ptr0"(#loc))
3
+ #loc11 = loc("xnumel"(#loc))
4
+ module {
5
+ tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
6
+ %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc1)
7
+ %c256_i32 = arith.constant 256 : i32 loc(#loc2)
8
+ %xoffset = tt.get_program_id x : i32 loc(#loc12)
9
+ %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13)
10
+ %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14)
11
+ %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc15)
12
+ %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc15)
13
+ %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16)
14
+ %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc16)
15
+ %0 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>> loc(#loc8)
16
+ %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr<i32>>, tensor<256xi32> loc(#loc8)
17
+ tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr<i32>> loc(#loc1)
18
+ tt.return loc(#loc9)
19
+ } loc(#loc)
20
+ } loc(#loc)
21
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:36)
22
+ #loc2 = loc(unknown)
23
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:28)
24
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:33)
25
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:36)
26
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:23)
27
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":21:21)
28
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:25)
29
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:4)
30
+ #loc12 = loc("xoffset"(#loc3))
31
+ #loc13 = loc("xoffset"(#loc4))
32
+ #loc14 = loc("xindex"(#loc5))
33
+ #loc15 = loc("xindex"(#loc6))
34
+ #loc16 = loc("xmask"(#loc7))
SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/__grp__triton_red_fused_zeros_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json"}}
SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin ADDED
Binary file (15.7 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "e1b3f1564b04bc7aede55cff09d6f5ffbd51b34448fc5f1195c8448b5e81fed8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"}