Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir +204 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx +525 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source +193 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir +147 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir +152 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/__grp__triton_tem_fused_zeros_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir +333 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx +809 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source +418 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir +280 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir +283 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir +781 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx +1410 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir +841 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir +799 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/__grp__triton_red_fused_argmax_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir +611 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx +1196 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source +315 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir +203 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir +204 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/__grp__triton_poi_fused_new_zeros_0.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir +52 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx +214 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source +38 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir +35 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir +34 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/__grp__triton_red_fused_zeros_0.json +1 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin +0 -0
- SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json +1 -0
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused__to_copy_clone_slice_sum_transpose_5.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin
ADDED
|
Binary file (17.2 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "d1c2e6527ce27b628a96c1a250025b39aad19d679fce295e820390aa7ae64b66", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_clone_slice_sum_transpose_5"}
|
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%10 = shl i32 %9, 5, !dbg !8
|
| 11 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%12 = and i32 %11, 31, !dbg !9
|
| 13 |
+
%13 = or disjoint i32 %10, %12, !dbg !10
|
| 14 |
+
%14 = icmp slt i32 %13, %4, !dbg !11
|
| 15 |
+
%15 = lshr i32 %11, 5, !dbg !12
|
| 16 |
+
%16 = and i32 %15, 3, !dbg !12
|
| 17 |
+
%17 = sext i32 %13 to i64, !dbg !13
|
| 18 |
+
%.frozen = freeze i64 %2, !dbg !14
|
| 19 |
+
%18 = sdiv i64 %17, %.frozen, !dbg !14
|
| 20 |
+
%19 = mul i64 %18, %.frozen, !dbg !13
|
| 21 |
+
%.decomposed = sub i64 %17, %19, !dbg !13
|
| 22 |
+
%20 = icmp sgt i32 %5, 0, !dbg !15
|
| 23 |
+
br i1 %20, label %.lr.ph, label %._crit_edge, !dbg !15
|
| 24 |
+
|
| 25 |
+
.lr.ph: ; preds = %8
|
| 26 |
+
%21 = mul i64 %3, %2, !dbg !16
|
| 27 |
+
%22 = mul i64 %21, %18, !dbg !17
|
| 28 |
+
%23 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed
|
| 29 |
+
%invariant.gep = getelementptr i32, ptr addrspace(1) %23, i64 %22, !dbg !15
|
| 30 |
+
%24 = insertelement <4 x i1> poison, i1 %14, i64 0, !dbg !18
|
| 31 |
+
%25 = shufflevector <4 x i1> %24, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !18
|
| 32 |
+
%26 = insertelement <4 x i32> poison, i32 %5, i64 0, !dbg !19
|
| 33 |
+
%27 = shufflevector <4 x i32> %26, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !19
|
| 34 |
+
br label %28, !dbg !15
|
| 35 |
+
|
| 36 |
+
28: ; preds = %.lr.ph, %28
|
| 37 |
+
%29 = phi i32 [ 0, %.lr.ph ], [ %68, %28 ]
|
| 38 |
+
%30 = phi <4 x i64> [ zeroinitializer, %.lr.ph ], [ %67, %28 ]
|
| 39 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
|
| 40 |
+
%32 = or disjoint i32 %29, %16, !dbg !21
|
| 41 |
+
%33 = or disjoint i32 %32, 4, !dbg !21
|
| 42 |
+
%34 = or disjoint i32 %32, 8, !dbg !21
|
| 43 |
+
%35 = or disjoint i32 %32, 12, !dbg !21
|
| 44 |
+
%36 = insertelement <4 x i32> poison, i32 %32, i64 0, !dbg !19
|
| 45 |
+
%37 = insertelement <4 x i32> %36, i32 %33, i64 1, !dbg !19
|
| 46 |
+
%38 = insertelement <4 x i32> %37, i32 %34, i64 2, !dbg !19
|
| 47 |
+
%39 = insertelement <4 x i32> %38, i32 %35, i64 3, !dbg !19
|
| 48 |
+
%40 = icmp slt <4 x i32> %39, %27, !dbg !19
|
| 49 |
+
%41 = sext i32 %32 to i64, !dbg !22
|
| 50 |
+
%42 = sext i32 %33 to i64, !dbg !22
|
| 51 |
+
%43 = sext i32 %34 to i64, !dbg !22
|
| 52 |
+
%44 = sext i32 %35 to i64, !dbg !22
|
| 53 |
+
%45 = mul i64 %2, %41, !dbg !22
|
| 54 |
+
%46 = mul i64 %2, %42, !dbg !22
|
| 55 |
+
%47 = mul i64 %2, %43, !dbg !22
|
| 56 |
+
%48 = mul i64 %2, %44, !dbg !22
|
| 57 |
+
%gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %45, !dbg !23
|
| 58 |
+
%gep4 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %46, !dbg !23
|
| 59 |
+
%gep6 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %47, !dbg !23
|
| 60 |
+
%gep8 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %48, !dbg !23
|
| 61 |
+
%49 = and <4 x i1> %25, %40, !dbg !18
|
| 62 |
+
%50 = extractelement <4 x i1> %49, i64 0, !dbg !20
|
| 63 |
+
%51 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %31, i1 %50) #5, !dbg !20
|
| 64 |
+
%52 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
|
| 65 |
+
%53 = extractelement <4 x i1> %49, i64 1, !dbg !20
|
| 66 |
+
%54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep4, i64 %52, i1 %53) #5, !dbg !20
|
| 67 |
+
%55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
|
| 68 |
+
%56 = extractelement <4 x i1> %49, i64 2, !dbg !20
|
| 69 |
+
%57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep6, i64 %55, i1 %56) #5, !dbg !20
|
| 70 |
+
%58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
|
| 71 |
+
%59 = extractelement <4 x i1> %49, i64 3, !dbg !20
|
| 72 |
+
%60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep8, i64 %58, i1 %59) #5, !dbg !20
|
| 73 |
+
%61 = insertelement <4 x i32> poison, i32 %51, i64 0, !dbg !24
|
| 74 |
+
%62 = insertelement <4 x i32> %61, i32 %54, i64 1, !dbg !24
|
| 75 |
+
%63 = insertelement <4 x i32> %62, i32 %57, i64 2, !dbg !24
|
| 76 |
+
%64 = insertelement <4 x i32> %63, i32 %60, i64 3, !dbg !24
|
| 77 |
+
%65 = sext <4 x i32> %64 to <4 x i64>, !dbg !24
|
| 78 |
+
%66 = select <4 x i1> %49, <4 x i64> %65, <4 x i64> zeroinitializer, !dbg !25
|
| 79 |
+
%67 = add <4 x i64> %66, %30, !dbg !25
|
| 80 |
+
%68 = add i32 %29, 16, !dbg !15
|
| 81 |
+
%69 = icmp slt i32 %68, %5, !dbg !15
|
| 82 |
+
br i1 %69, label %28, label %._crit_edge.loopexit, !dbg !15
|
| 83 |
+
|
| 84 |
+
._crit_edge.loopexit: ; preds = %28
|
| 85 |
+
%70 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %67), !dbg !26
|
| 86 |
+
br label %._crit_edge, !dbg !26
|
| 87 |
+
|
| 88 |
+
._crit_edge: ; preds = %._crit_edge.loopexit, %8
|
| 89 |
+
%71 = phi i64 [ 0, %8 ], [ %70, %._crit_edge.loopexit ], !dbg !26
|
| 90 |
+
%.idx = shl nuw nsw i32 %12, 5, !dbg !30
|
| 91 |
+
%72 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !30
|
| 92 |
+
%73 = getelementptr i64, ptr addrspace(3) %72, i32 %16, !dbg !30
|
| 93 |
+
%74 = insertelement <1 x i64> poison, i64 %71, i64 0, !dbg !30
|
| 94 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %73, <1 x i64> %74, i1 true) #5, !dbg !30
|
| 95 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
|
| 96 |
+
%75 = icmp samesign ult i32 %11, 128, !dbg !30
|
| 97 |
+
%76 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !30
|
| 98 |
+
%77 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %76, i1 %75) #5, !dbg !30
|
| 99 |
+
%extelt.offset = lshr i64 %77, 32, !dbg !30
|
| 100 |
+
%78 = trunc nuw i64 %extelt.offset to i32, !dbg !30
|
| 101 |
+
%79 = trunc i64 %77 to i32, !dbg !30
|
| 102 |
+
%80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 2, i32 31), !dbg !30
|
| 103 |
+
%81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 2, i32 31), !dbg !30
|
| 104 |
+
%82 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !30
|
| 105 |
+
%83 = insertelement <2 x i32> %82, i32 %81, i64 1, !dbg !30
|
| 106 |
+
%84 = bitcast <2 x i32> %83 to i64, !dbg !30
|
| 107 |
+
%85 = add i64 %77, %84, !dbg !26
|
| 108 |
+
%extelt.offset2 = lshr i64 %85, 32, !dbg !30
|
| 109 |
+
%86 = trunc nuw i64 %extelt.offset2 to i32, !dbg !30
|
| 110 |
+
%87 = trunc i64 %85 to i32, !dbg !30
|
| 111 |
+
%88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !30
|
| 112 |
+
%89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !30
|
| 113 |
+
%90 = insertelement <2 x i32> poison, i32 %88, i64 0, !dbg !30
|
| 114 |
+
%91 = insertelement <2 x i32> %90, i32 %89, i64 1, !dbg !30
|
| 115 |
+
%92 = bitcast <2 x i32> %91 to i64, !dbg !30
|
| 116 |
+
%93 = add i64 %85, %92, !dbg !26
|
| 117 |
+
%94 = and i32 %11, 899, !dbg !30
|
| 118 |
+
%95 = icmp eq i32 %94, 0, !dbg !30
|
| 119 |
+
%96 = insertelement <1 x i64> poison, i64 %93, i64 0, !dbg !30
|
| 120 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %76, <1 x i64> %96, i1 %95) #5, !dbg !30
|
| 121 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
|
| 122 |
+
%97 = load i64, ptr addrspace(3) %72, align 16, !dbg !30
|
| 123 |
+
%98 = trunc i64 %97 to i32, !dbg !31
|
| 124 |
+
%99 = icmp slt i64 %2, 2, !dbg !32
|
| 125 |
+
%100 = icmp sgt i64 %2, 1, !dbg !33
|
| 126 |
+
%101 = select i1 %100, i64 %2, i64 0, !dbg !34
|
| 127 |
+
%102 = zext i1 %99 to i64, !dbg !35
|
| 128 |
+
%103 = add i64 %101, %102, !dbg !36
|
| 129 |
+
%104 = mul i64 %18, %103, !dbg !37
|
| 130 |
+
%105 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !38
|
| 131 |
+
%106 = getelementptr i32, ptr addrspace(1) %105, i64 %104, !dbg !38
|
| 132 |
+
%107 = and i32 %11, 96, !dbg !39
|
| 133 |
+
%108 = icmp eq i32 %107, 0, !dbg !39
|
| 134 |
+
%109 = and i1 %108, %14, !dbg !39
|
| 135 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %98, ptr addrspace(1) %106, i1 %109) #5, !dbg !39
|
| 136 |
+
ret void, !dbg !40
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 140 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 141 |
+
|
| 142 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 143 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 144 |
+
|
| 145 |
+
; Function Attrs: convergent nocallback nounwind
|
| 146 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
|
| 147 |
+
|
| 148 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 149 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
|
| 150 |
+
|
| 151 |
+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 152 |
+
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4
|
| 153 |
+
|
| 154 |
+
attributes #0 = { nounwind "nvvm.reqntid"="128" }
|
| 155 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 156 |
+
attributes #2 = { convergent nocallback nounwind }
|
| 157 |
+
attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 158 |
+
attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 159 |
+
attributes #5 = { nounwind }
|
| 160 |
+
|
| 161 |
+
!llvm.dbg.cu = !{!0}
|
| 162 |
+
!llvm.module.flags = !{!2, !3}
|
| 163 |
+
|
| 164 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 165 |
+
!1 = !DIFile(filename: "cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt")
|
| 166 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 167 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 168 |
+
!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 169 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 170 |
+
!6 = !{}
|
| 171 |
+
!7 = !DILocation(line: 21, column: 28, scope: !4)
|
| 172 |
+
!8 = !DILocation(line: 21, column: 33, scope: !4)
|
| 173 |
+
!9 = !DILocation(line: 22, column: 44, scope: !4)
|
| 174 |
+
!10 = !DILocation(line: 22, column: 23, scope: !4)
|
| 175 |
+
!11 = !DILocation(line: 23, column: 21, scope: !4)
|
| 176 |
+
!12 = !DILocation(line: 24, column: 37, scope: !4)
|
| 177 |
+
!13 = !DILocation(line: 26, column: 19, scope: !4)
|
| 178 |
+
!14 = !DILocation(line: 27, column: 19, scope: !4)
|
| 179 |
+
!15 = !DILocation(line: 30, column: 40, scope: !4)
|
| 180 |
+
!16 = !DILocation(line: 36, column: 54, scope: !4)
|
| 181 |
+
!17 = !DILocation(line: 36, column: 58, scope: !4)
|
| 182 |
+
!18 = !DILocation(line: 36, column: 73, scope: !4)
|
| 183 |
+
!19 = !DILocation(line: 32, column: 29, scope: !4)
|
| 184 |
+
!20 = !DILocation(line: 36, column: 63, scope: !4)
|
| 185 |
+
!21 = !DILocation(line: 31, column: 31, scope: !4)
|
| 186 |
+
!22 = !DILocation(line: 36, column: 43, scope: !4)
|
| 187 |
+
!23 = !DILocation(line: 36, column: 34, scope: !4)
|
| 188 |
+
!24 = !DILocation(line: 37, column: 23, scope: !4)
|
| 189 |
+
!25 = !DILocation(line: 40, column: 48, scope: !4)
|
| 190 |
+
!26 = !DILocation(line: 261, column: 15, scope: !27, inlinedAt: !29)
|
| 191 |
+
!27 = distinct !DILexicalBlockFile(scope: !4, file: !28, discriminator: 0)
|
| 192 |
+
!28 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
|
| 193 |
+
!29 = !DILocation(line: 41, column: 25, scope: !4)
|
| 194 |
+
!30 = !DILocation(line: 291, column: 36, scope: !27, inlinedAt: !29)
|
| 195 |
+
!31 = !DILocation(line: 42, column: 19, scope: !4)
|
| 196 |
+
!32 = !DILocation(line: 43, column: 49, scope: !4)
|
| 197 |
+
!33 = !DILocation(line: 43, column: 75, scope: !4)
|
| 198 |
+
!34 = !DILocation(line: 43, column: 66, scope: !4)
|
| 199 |
+
!35 = !DILocation(line: 43, scope: !4)
|
| 200 |
+
!36 = !DILocation(line: 43, column: 57, scope: !4)
|
| 201 |
+
!37 = !DILocation(line: 43, column: 34, scope: !4)
|
| 202 |
+
!38 = !DILocation(line: 43, column: 25, scope: !4)
|
| 203 |
+
!39 = !DILocation(line: 43, column: 88, scope: !4)
|
| 204 |
+
!40 = !DILocation(line: 43, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx
ADDED
|
@@ -0,0 +1,525 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
// @triton_red_fused__to_copy_clone_slice_sum_transpose_5
|
| 12 |
+
.visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5(
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1,
|
| 15 |
+
.param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2,
|
| 16 |
+
.param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3,
|
| 17 |
+
.param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4,
|
| 18 |
+
.param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5,
|
| 19 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6,
|
| 20 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7
|
| 21 |
+
)
|
| 22 |
+
.reqntid 128
|
| 23 |
+
{
|
| 24 |
+
.reg .pred %p<24>;
|
| 25 |
+
.reg .b32 %r<51>;
|
| 26 |
+
.reg .b64 %rd<97>;
|
| 27 |
+
.loc 1 18 0 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:18:0
|
| 28 |
+
$L__func_begin0:
|
| 29 |
+
.loc 1 18 0 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:18:0
|
| 30 |
+
|
| 31 |
+
// %bb.0:
|
| 32 |
+
ld.param.b32 %r11, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5];
|
| 33 |
+
ld.param.b64 %rd20, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2];
|
| 34 |
+
$L__tmp0:
|
| 35 |
+
.loc 1 21 28 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:21:28
|
| 36 |
+
mov.u32 %r12, %ctaid.x;
|
| 37 |
+
.loc 1 21 33 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:21:33
|
| 38 |
+
shl.b32 %r13, %r12, 5;
|
| 39 |
+
.loc 1 22 44 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:22:44
|
| 40 |
+
mov.u32 %r1, %tid.x;
|
| 41 |
+
and.b32 %r2, %r1, 31;
|
| 42 |
+
.loc 1 22 23 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:22:23
|
| 43 |
+
or.b32 %r14, %r13, %r2;
|
| 44 |
+
.loc 1 26 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:26:19
|
| 45 |
+
cvt.s64.s32 %rd1, %r14;
|
| 46 |
+
.loc 1 27 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:27:19
|
| 47 |
+
or.b64 %rd23, %rd1, %rd20;
|
| 48 |
+
and.b64 %rd24, %rd23, -4294967296;
|
| 49 |
+
setp.ne.b64 %p5, %rd24, 0;
|
| 50 |
+
cvt.u32.u64 %r49, %rd1;
|
| 51 |
+
@%p5 bra $L__BB0_2;
|
| 52 |
+
bra.uni $L__BB0_1;
|
| 53 |
+
$L__BB0_2:
|
| 54 |
+
div.s64 %rd91, %rd1, %rd20;
|
| 55 |
+
bra.uni $L__BB0_3;
|
| 56 |
+
$L__BB0_1:
|
| 57 |
+
cvt.u32.u64 %r15, %rd20;
|
| 58 |
+
div.u32 %r17, %r49, %r15;
|
| 59 |
+
cvt.u64.u32 %rd91, %r17;
|
| 60 |
+
$L__BB0_3:
|
| 61 |
+
.loc 1 0 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:0:19
|
| 62 |
+
ld.param.b32 %r10, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4];
|
| 63 |
+
ld.param.b64 %rd19, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1];
|
| 64 |
+
bfe.u32 %r3, %r1, 5, 2;
|
| 65 |
+
.loc 1 26 19 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:26:19
|
| 66 |
+
mul.lo.s64 %rd26, %rd91, %rd20;
|
| 67 |
+
sub.s64 %rd6, %rd1, %rd26;
|
| 68 |
+
.loc 1 30 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:30:40
|
| 69 |
+
setp.lt.s32 %p6, %r11, 1;
|
| 70 |
+
mov.b64 %rd96, 0;
|
| 71 |
+
shl.b64 %rd90, %rd6, 2;
|
| 72 |
+
@%p6 bra $L__BB0_7;
|
| 73 |
+
// %bb.4: // %.lr.ph
|
| 74 |
+
.loc 1 0 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:0:40
|
| 75 |
+
ld.param.b64 %rd21, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3];
|
| 76 |
+
ld.param.b64 %rd18, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0];
|
| 77 |
+
.loc 1 23 21 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:23:21
|
| 78 |
+
setp.lt.s32 %p1, %r49, %r10;
|
| 79 |
+
.loc 1 36 54 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:54
|
| 80 |
+
mul.lo.s64 %rd31, %rd21, %rd20;
|
| 81 |
+
.loc 1 36 58 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:58
|
| 82 |
+
mul.lo.s64 %rd32, %rd31, %rd91;
|
| 83 |
+
add.s64 %rd34, %rd18, %rd90;
|
| 84 |
+
.loc 1 30 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:30:40
|
| 85 |
+
shl.b64 %rd35, %rd32, 2;
|
| 86 |
+
add.s64 %rd7, %rd34, %rd35;
|
| 87 |
+
mov.b64 %rd92, 0;
|
| 88 |
+
mov.b32 %r50, 0;
|
| 89 |
+
mov.b64 %rd93, %rd92;
|
| 90 |
+
mov.b64 %rd94, %rd92;
|
| 91 |
+
mov.b64 %rd95, %rd92;
|
| 92 |
+
$L__BB0_5: // =>This Inner Loop Header: Depth=1
|
| 93 |
+
.loc 1 36 63 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:63
|
| 94 |
+
// begin inline asm
|
| 95 |
+
mov.u64 %rd36, 0x0;
|
| 96 |
+
createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0;
|
| 97 |
+
// end inline asm
|
| 98 |
+
.loc 1 31 31 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:31:31
|
| 99 |
+
add.s32 %r24, %r3, %r50;
|
| 100 |
+
add.s32 %r25, %r24, 4;
|
| 101 |
+
add.s32 %r26, %r24, 8;
|
| 102 |
+
.loc 1 32 29 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:32:29
|
| 103 |
+
add.s32 %r27, %r24, 12;
|
| 104 |
+
setp.lt.s32 %p11, %r24, %r11;
|
| 105 |
+
setp.lt.s32 %p12, %r25, %r11;
|
| 106 |
+
setp.lt.s32 %p13, %r26, %r11;
|
| 107 |
+
setp.lt.s32 %p14, %r27, %r11;
|
| 108 |
+
.loc 1 36 43 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:43
|
| 109 |
+
cvt.s64.s32 %rd48, %r24;
|
| 110 |
+
cvt.s64.s32 %rd49, %r25;
|
| 111 |
+
cvt.s64.s32 %rd50, %r26;
|
| 112 |
+
cvt.s64.s32 %rd51, %r27;
|
| 113 |
+
mul.lo.s64 %rd52, %rd20, %rd48;
|
| 114 |
+
mul.lo.s64 %rd53, %rd20, %rd49;
|
| 115 |
+
mul.lo.s64 %rd54, %rd20, %rd50;
|
| 116 |
+
mul.lo.s64 %rd55, %rd20, %rd51;
|
| 117 |
+
.loc 1 36 34 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:34
|
| 118 |
+
shl.b64 %rd56, %rd52, 2;
|
| 119 |
+
add.s64 %rd37, %rd7, %rd56;
|
| 120 |
+
shl.b64 %rd57, %rd53, 2;
|
| 121 |
+
add.s64 %rd40, %rd7, %rd57;
|
| 122 |
+
shl.b64 %rd58, %rd54, 2;
|
| 123 |
+
add.s64 %rd43, %rd7, %rd58;
|
| 124 |
+
shl.b64 %rd59, %rd55, 2;
|
| 125 |
+
add.s64 %rd46, %rd7, %rd59;
|
| 126 |
+
.loc 1 36 73 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:73
|
| 127 |
+
and.pred %p10, %p1, %p14;
|
| 128 |
+
and.pred %p9, %p1, %p13;
|
| 129 |
+
and.pred %p8, %p1, %p12;
|
| 130 |
+
and.pred %p7, %p1, %p11;
|
| 131 |
+
.loc 1 36 63 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:36:63
|
| 132 |
+
// begin inline asm
|
| 133 |
+
mov.u32 %r20, 0x0;
|
| 134 |
+
@%p7 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd37 + 0 ], %rd36;
|
| 135 |
+
// end inline asm
|
| 136 |
+
// begin inline asm
|
| 137 |
+
mov.u64 %rd39, 0x0;
|
| 138 |
+
createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
|
| 139 |
+
// end inline asm
|
| 140 |
+
// begin inline asm
|
| 141 |
+
mov.u32 %r21, 0x0;
|
| 142 |
+
@%p8 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd40 + 0 ], %rd39;
|
| 143 |
+
// end inline asm
|
| 144 |
+
// begin inline asm
|
| 145 |
+
mov.u64 %rd42, 0x0;
|
| 146 |
+
createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0;
|
| 147 |
+
// end inline asm
|
| 148 |
+
// begin inline asm
|
| 149 |
+
mov.u32 %r22, 0x0;
|
| 150 |
+
@%p9 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd43 + 0 ], %rd42;
|
| 151 |
+
// end inline asm
|
| 152 |
+
// begin inline asm
|
| 153 |
+
mov.u64 %rd45, 0x0;
|
| 154 |
+
createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
|
| 155 |
+
// end inline asm
|
| 156 |
+
// begin inline asm
|
| 157 |
+
mov.u32 %r23, 0x0;
|
| 158 |
+
@%p10 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd46 + 0 ], %rd45;
|
| 159 |
+
// end inline asm
|
| 160 |
+
.loc 1 37 23 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:37:23
|
| 161 |
+
cvt.s64.s32 %rd60, %r20;
|
| 162 |
+
cvt.s64.s32 %rd61, %r21;
|
| 163 |
+
cvt.s64.s32 %rd62, %r22;
|
| 164 |
+
cvt.s64.s32 %rd63, %r23;
|
| 165 |
+
.loc 1 40 48 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:40:48
|
| 166 |
+
selp.b64 %rd64, %rd63, 0, %p10;
|
| 167 |
+
selp.b64 %rd65, %rd62, 0, %p9;
|
| 168 |
+
selp.b64 %rd66, %rd61, 0, %p8;
|
| 169 |
+
selp.b64 %rd67, %rd60, 0, %p7;
|
| 170 |
+
add.s64 %rd92, %rd67, %rd92;
|
| 171 |
+
add.s64 %rd93, %rd66, %rd93;
|
| 172 |
+
add.s64 %rd94, %rd65, %rd94;
|
| 173 |
+
add.s64 %rd95, %rd64, %rd95;
|
| 174 |
+
.loc 1 30 40 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:30:40
|
| 175 |
+
add.s32 %r50, %r50, 16;
|
| 176 |
+
setp.lt.s32 %p15, %r50, %r11;
|
| 177 |
+
@%p15 bra $L__BB0_5;
|
| 178 |
+
// %bb.6: // %._crit_edge.loopexit
|
| 179 |
+
$L__tmp1:
|
| 180 |
+
.loc 2 261 15 // standard.py:261:15 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
|
| 181 |
+
add.s64 %rd68, %rd92, %rd94;
|
| 182 |
+
add.s64 %rd69, %rd93, %rd95;
|
| 183 |
+
add.s64 %rd96, %rd68, %rd69;
|
| 184 |
+
$L__tmp2:
|
| 185 |
+
$L__BB0_7: // %._crit_edge
|
| 186 |
+
.loc 1 23 21 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:23:21
|
| 187 |
+
setp.lt.s32 %p20, %r49, %r10;
|
| 188 |
+
$L__tmp3:
|
| 189 |
+
.loc 2 291 36 // standard.py:291:36 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
|
| 190 |
+
shl.b32 %r33, %r2, 5;
|
| 191 |
+
mov.b32 %r34, global_smem;
|
| 192 |
+
add.s32 %r35, %r34, %r33;
|
| 193 |
+
shl.b32 %r36, %r3, 3;
|
| 194 |
+
add.s32 %r28, %r35, %r36;
|
| 195 |
+
mov.pred %p16, -1;
|
| 196 |
+
// begin inline asm
|
| 197 |
+
@%p16 st.shared.b64 [ %r28 + 0 ], %rd96;
|
| 198 |
+
// end inline asm
|
| 199 |
+
bar.sync 0;
|
| 200 |
+
setp.lt.u32 %p17, %r1, 128;
|
| 201 |
+
shl.b32 %r37, %r1, 3;
|
| 202 |
+
add.s32 %r29, %r34, %r37;
|
| 203 |
+
// begin inline asm
|
| 204 |
+
@%p17 ld.shared.b64 %rd71, [ %r29 + 0 ];
|
| 205 |
+
// end inline asm
|
| 206 |
+
mov.b64 {_, %r38}, %rd71;
|
| 207 |
+
cvt.u32.u64 %r39, %rd71;
|
| 208 |
+
shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
|
| 209 |
+
shfl.sync.bfly.b32 %r41, %r38, 2, 31, -1;
|
| 210 |
+
cvt.u64.u32 %rd74, %r40;
|
| 211 |
+
cvt.u64.u32 %rd75, %r41;
|
| 212 |
+
shl.b64 %rd76, %rd75, 32;
|
| 213 |
+
or.b64 %rd77, %rd74, %rd76;
|
| 214 |
+
.loc 2 261 15 // standard.py:261:15 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
|
| 215 |
+
add.s64 %rd78, %rd71, %rd77;
|
| 216 |
+
.loc 2 291 36 // standard.py:291:36 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
|
| 217 |
+
mov.b64 {_, %r42}, %rd78;
|
| 218 |
+
cvt.u32.u64 %r43, %rd78;
|
| 219 |
+
shfl.sync.bfly.b32 %r44, %r43, 1, 31, -1;
|
| 220 |
+
shfl.sync.bfly.b32 %r45, %r42, 1, 31, -1;
|
| 221 |
+
cvt.u64.u32 %rd79, %r44;
|
| 222 |
+
cvt.u64.u32 %rd80, %r45;
|
| 223 |
+
shl.b64 %rd81, %rd80, 32;
|
| 224 |
+
or.b64 %rd82, %rd79, %rd81;
|
| 225 |
+
.loc 2 261 15 // standard.py:261:15 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
|
| 226 |
+
add.s64 %rd72, %rd78, %rd82;
|
| 227 |
+
.loc 2 291 36 // standard.py:291:36 @[ cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:41:25 ]
|
| 228 |
+
and.b32 %r46, %r1, 899;
|
| 229 |
+
setp.eq.b32 %p18, %r46, 0;
|
| 230 |
+
// begin inline asm
|
| 231 |
+
@%p18 st.shared.b64 [ %r29 + 0 ], %rd72;
|
| 232 |
+
// end inline asm
|
| 233 |
+
bar.sync 0;
|
| 234 |
+
ld.shared.b32 %r31, [%r35];
|
| 235 |
+
$L__tmp4:
|
| 236 |
+
.loc 1 43 49 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:49
|
| 237 |
+
setp.lt.s64 %p21, %rd20, 2;
|
| 238 |
+
.loc 1 43 75 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:75
|
| 239 |
+
setp.gt.s64 %p22, %rd20, 1;
|
| 240 |
+
.loc 1 43 66 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:66
|
| 241 |
+
selp.b64 %rd83, %rd20, 0, %p22;
|
| 242 |
+
.loc 1 43 0 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43
|
| 243 |
+
selp.b64 %rd84, 1, 0, %p21;
|
| 244 |
+
.loc 1 43 57 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:57
|
| 245 |
+
add.s64 %rd85, %rd83, %rd84;
|
| 246 |
+
.loc 1 43 34 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:34
|
| 247 |
+
mul.lo.s64 %rd86, %rd91, %rd85;
|
| 248 |
+
.loc 1 43 25 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:25
|
| 249 |
+
add.s64 %rd88, %rd19, %rd90;
|
| 250 |
+
shl.b64 %rd89, %rd86, 2;
|
| 251 |
+
add.s64 %rd73, %rd88, %rd89;
|
| 252 |
+
.loc 1 43 88 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:88
|
| 253 |
+
and.b32 %r47, %r1, 96;
|
| 254 |
+
setp.eq.b32 %p23, %r47, 0;
|
| 255 |
+
and.pred %p19, %p23, %p20;
|
| 256 |
+
// begin inline asm
|
| 257 |
+
@%p19 st.global.b32 [ %rd73 + 0 ], { %r31 };
|
| 258 |
+
// end inline asm
|
| 259 |
+
.loc 1 43 4 // cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py:43:4
|
| 260 |
+
ret;
|
| 261 |
+
$L__tmp5:
|
| 262 |
+
$L__func_end0:
|
| 263 |
+
// -- End function
|
| 264 |
+
}
|
| 265 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py"
|
| 266 |
+
.file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
|
| 267 |
+
.section .debug_abbrev
|
| 268 |
+
{
|
| 269 |
+
.b8 1 // Abbreviation Code
|
| 270 |
+
.b8 17 // DW_TAG_compile_unit
|
| 271 |
+
.b8 1 // DW_CHILDREN_yes
|
| 272 |
+
.b8 37 // DW_AT_producer
|
| 273 |
+
.b8 8 // DW_FORM_string
|
| 274 |
+
.b8 19 // DW_AT_language
|
| 275 |
+
.b8 5 // DW_FORM_data2
|
| 276 |
+
.b8 3 // DW_AT_name
|
| 277 |
+
.b8 8 // DW_FORM_string
|
| 278 |
+
.b8 16 // DW_AT_stmt_list
|
| 279 |
+
.b8 6 // DW_FORM_data4
|
| 280 |
+
.b8 27 // DW_AT_comp_dir
|
| 281 |
+
.b8 8 // DW_FORM_string
|
| 282 |
+
.b8 0 // EOM(1)
|
| 283 |
+
.b8 0 // EOM(2)
|
| 284 |
+
.b8 2 // Abbreviation Code
|
| 285 |
+
.b8 46 // DW_TAG_subprogram
|
| 286 |
+
.b8 0 // DW_CHILDREN_no
|
| 287 |
+
.b8 3 // DW_AT_name
|
| 288 |
+
.b8 8 // DW_FORM_string
|
| 289 |
+
.b8 32 // DW_AT_inline
|
| 290 |
+
.b8 11 // DW_FORM_data1
|
| 291 |
+
.b8 0 // EOM(1)
|
| 292 |
+
.b8 0 // EOM(2)
|
| 293 |
+
.b8 3 // Abbreviation Code
|
| 294 |
+
.b8 46 // DW_TAG_subprogram
|
| 295 |
+
.b8 1 // DW_CHILDREN_yes
|
| 296 |
+
.b8 17 // DW_AT_low_pc
|
| 297 |
+
.b8 1 // DW_FORM_addr
|
| 298 |
+
.b8 18 // DW_AT_high_pc
|
| 299 |
+
.b8 1 // DW_FORM_addr
|
| 300 |
+
.b8 49 // DW_AT_abstract_origin
|
| 301 |
+
.b8 19 // DW_FORM_ref4
|
| 302 |
+
.b8 0 // EOM(1)
|
| 303 |
+
.b8 0 // EOM(2)
|
| 304 |
+
.b8 4 // Abbreviation Code
|
| 305 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 306 |
+
.b8 0 // DW_CHILDREN_no
|
| 307 |
+
.b8 49 // DW_AT_abstract_origin
|
| 308 |
+
.b8 19 // DW_FORM_ref4
|
| 309 |
+
.b8 17 // DW_AT_low_pc
|
| 310 |
+
.b8 1 // DW_FORM_addr
|
| 311 |
+
.b8 18 // DW_AT_high_pc
|
| 312 |
+
.b8 1 // DW_FORM_addr
|
| 313 |
+
.b8 88 // DW_AT_call_file
|
| 314 |
+
.b8 11 // DW_FORM_data1
|
| 315 |
+
.b8 89 // DW_AT_call_line
|
| 316 |
+
.b8 11 // DW_FORM_data1
|
| 317 |
+
.b8 87 // DW_AT_call_column
|
| 318 |
+
.b8 11 // DW_FORM_data1
|
| 319 |
+
.b8 0 // EOM(1)
|
| 320 |
+
.b8 0 // EOM(2)
|
| 321 |
+
.b8 0 // EOM(3)
|
| 322 |
+
}
|
| 323 |
+
.section .debug_info
|
| 324 |
+
{
|
| 325 |
+
.b32 238 // Length of Unit
|
| 326 |
+
.b8 2 // DWARF version number
|
| 327 |
+
.b8 0
|
| 328 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 329 |
+
.b8 8 // Address Size (in bytes)
|
| 330 |
+
.b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit
|
| 331 |
+
.b8 116 // DW_AT_producer
|
| 332 |
+
.b8 114
|
| 333 |
+
.b8 105
|
| 334 |
+
.b8 116
|
| 335 |
+
.b8 111
|
| 336 |
+
.b8 110
|
| 337 |
+
.b8 0
|
| 338 |
+
.b8 2 // DW_AT_language
|
| 339 |
+
.b8 0
|
| 340 |
+
.b8 99 // DW_AT_name
|
| 341 |
+
.b8 120
|
| 342 |
+
.b8 116
|
| 343 |
+
.b8 109
|
| 344 |
+
.b8 114
|
| 345 |
+
.b8 99
|
| 346 |
+
.b8 110
|
| 347 |
+
.b8 54
|
| 348 |
+
.b8 110
|
| 349 |
+
.b8 100
|
| 350 |
+
.b8 103
|
| 351 |
+
.b8 104
|
| 352 |
+
.b8 117
|
| 353 |
+
.b8 107
|
| 354 |
+
.b8 102
|
| 355 |
+
.b8 104
|
| 356 |
+
.b8 52
|
| 357 |
+
.b8 50
|
| 358 |
+
.b8 99
|
| 359 |
+
.b8 108
|
| 360 |
+
.b8 117
|
| 361 |
+
.b8 122
|
| 362 |
+
.b8 105
|
| 363 |
+
.b8 119
|
| 364 |
+
.b8 101
|
| 365 |
+
.b8 111
|
| 366 |
+
.b8 97
|
| 367 |
+
.b8 122
|
| 368 |
+
.b8 102
|
| 369 |
+
.b8 109
|
| 370 |
+
.b8 102
|
| 371 |
+
.b8 114
|
| 372 |
+
.b8 122
|
| 373 |
+
.b8 51
|
| 374 |
+
.b8 106
|
| 375 |
+
.b8 101
|
| 376 |
+
.b8 111
|
| 377 |
+
.b8 112
|
| 378 |
+
.b8 99
|
| 379 |
+
.b8 114
|
| 380 |
+
.b8 114
|
| 381 |
+
.b8 111
|
| 382 |
+
.b8 109
|
| 383 |
+
.b8 119
|
| 384 |
+
.b8 118
|
| 385 |
+
.b8 106
|
| 386 |
+
.b8 53
|
| 387 |
+
.b8 109
|
| 388 |
+
.b8 51
|
| 389 |
+
.b8 108
|
| 390 |
+
.b8 115
|
| 391 |
+
.b8 106
|
| 392 |
+
.b8 46
|
| 393 |
+
.b8 112
|
| 394 |
+
.b8 121
|
| 395 |
+
.b8 0
|
| 396 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 397 |
+
.b8 47 // DW_AT_comp_dir
|
| 398 |
+
.b8 119
|
| 399 |
+
.b8 111
|
| 400 |
+
.b8 114
|
| 401 |
+
.b8 107
|
| 402 |
+
.b8 115
|
| 403 |
+
.b8 112
|
| 404 |
+
.b8 97
|
| 405 |
+
.b8 99
|
| 406 |
+
.b8 101
|
| 407 |
+
.b8 47
|
| 408 |
+
.b8 104
|
| 409 |
+
.b8 97
|
| 410 |
+
.b8 110
|
| 411 |
+
.b8 114
|
| 412 |
+
.b8 117
|
| 413 |
+
.b8 105
|
| 414 |
+
.b8 47
|
| 415 |
+
.b8 83
|
| 416 |
+
.b8 112
|
| 417 |
+
.b8 101
|
| 418 |
+
.b8 99
|
| 419 |
+
.b8 70
|
| 420 |
+
.b8 111
|
| 421 |
+
.b8 114
|
| 422 |
+
.b8 103
|
| 423 |
+
.b8 101
|
| 424 |
+
.b8 45
|
| 425 |
+
.b8 101
|
| 426 |
+
.b8 120
|
| 427 |
+
.b8 116
|
| 428 |
+
.b8 47
|
| 429 |
+
.b8 99
|
| 430 |
+
.b8 97
|
| 431 |
+
.b8 99
|
| 432 |
+
.b8 104
|
| 433 |
+
.b8 101
|
| 434 |
+
.b8 47
|
| 435 |
+
.b8 99
|
| 436 |
+
.b8 111
|
| 437 |
+
.b8 109
|
| 438 |
+
.b8 112
|
| 439 |
+
.b8 105
|
| 440 |
+
.b8 108
|
| 441 |
+
.b8 101
|
| 442 |
+
.b8 100
|
| 443 |
+
.b8 95
|
| 444 |
+
.b8 107
|
| 445 |
+
.b8 101
|
| 446 |
+
.b8 114
|
| 447 |
+
.b8 110
|
| 448 |
+
.b8 101
|
| 449 |
+
.b8 108
|
| 450 |
+
.b8 115
|
| 451 |
+
.b8 47
|
| 452 |
+
.b8 120
|
| 453 |
+
.b8 116
|
| 454 |
+
.b8 0
|
| 455 |
+
.b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram
|
| 456 |
+
.b8 116 // DW_AT_name
|
| 457 |
+
.b8 114
|
| 458 |
+
.b8 105
|
| 459 |
+
.b8 116
|
| 460 |
+
.b8 111
|
| 461 |
+
.b8 110
|
| 462 |
+
.b8 95
|
| 463 |
+
.b8 114
|
| 464 |
+
.b8 101
|
| 465 |
+
.b8 100
|
| 466 |
+
.b8 95
|
| 467 |
+
.b8 102
|
| 468 |
+
.b8 117
|
| 469 |
+
.b8 115
|
| 470 |
+
.b8 101
|
| 471 |
+
.b8 100
|
| 472 |
+
.b8 95
|
| 473 |
+
.b8 95
|
| 474 |
+
.b8 116
|
| 475 |
+
.b8 111
|
| 476 |
+
.b8 95
|
| 477 |
+
.b8 99
|
| 478 |
+
.b8 111
|
| 479 |
+
.b8 112
|
| 480 |
+
.b8 121
|
| 481 |
+
.b8 95
|
| 482 |
+
.b8 99
|
| 483 |
+
.b8 108
|
| 484 |
+
.b8 111
|
| 485 |
+
.b8 110
|
| 486 |
+
.b8 101
|
| 487 |
+
.b8 95
|
| 488 |
+
.b8 115
|
| 489 |
+
.b8 108
|
| 490 |
+
.b8 105
|
| 491 |
+
.b8 99
|
| 492 |
+
.b8 101
|
| 493 |
+
.b8 95
|
| 494 |
+
.b8 115
|
| 495 |
+
.b8 117
|
| 496 |
+
.b8 109
|
| 497 |
+
.b8 95
|
| 498 |
+
.b8 116
|
| 499 |
+
.b8 114
|
| 500 |
+
.b8 97
|
| 501 |
+
.b8 110
|
| 502 |
+
.b8 115
|
| 503 |
+
.b8 112
|
| 504 |
+
.b8 111
|
| 505 |
+
.b8 115
|
| 506 |
+
.b8 101
|
| 507 |
+
.b8 95
|
| 508 |
+
.b8 53
|
| 509 |
+
.b8 0
|
| 510 |
+
.b8 1 // DW_AT_inline
|
| 511 |
+
.b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram
|
| 512 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 513 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 514 |
+
.b32 139 // DW_AT_abstract_origin
|
| 515 |
+
.b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine
|
| 516 |
+
.b32 139 // DW_AT_abstract_origin
|
| 517 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 518 |
+
.b64 $L__tmp4 // DW_AT_high_pc
|
| 519 |
+
.b8 1 // DW_AT_call_file
|
| 520 |
+
.b8 41 // DW_AT_call_line
|
| 521 |
+
.b8 25 // DW_AT_call_column
|
| 522 |
+
.b8 0 // End Of Children Mark
|
| 523 |
+
.b8 0 // End Of Children Mark
|
| 524 |
+
}
|
| 525 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":18:0)
|
| 2 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
|
| 3 |
+
#loc43 = loc(unknown)
|
| 4 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
|
| 5 |
+
#loc50 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc51 = loc("out_ptr1"(#loc))
|
| 7 |
+
#loc52 = loc("ks0"(#loc))
|
| 8 |
+
#loc53 = loc("ks1"(#loc))
|
| 9 |
+
#loc54 = loc("xnumel"(#loc))
|
| 10 |
+
#loc55 = loc("r0_numel"(#loc))
|
| 11 |
+
#loc85 = loc("input"(#loc41))
|
| 12 |
+
#loc86 = loc("a"(#loc46))
|
| 13 |
+
#loc87 = loc("b"(#loc46))
|
| 14 |
+
module {
|
| 15 |
+
tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 16 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc56)
|
| 17 |
+
%xoffset_0 = arith.constant 32 : i32 loc(#loc57)
|
| 18 |
+
%xoffset_1 = arith.constant 32 : i32 loc(#loc57)
|
| 19 |
+
%xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc57)
|
| 20 |
+
%xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc58)
|
| 21 |
+
%xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc59)
|
| 22 |
+
%xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<32x1xi32> loc(#loc60)
|
| 23 |
+
%xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<32x1xi32> loc(#loc60)
|
| 24 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc61)
|
| 25 |
+
%xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<32x1xi32> loc(#loc61)
|
| 26 |
+
%r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc62)
|
| 27 |
+
%r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc63)
|
| 28 |
+
%x0 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc64)
|
| 29 |
+
%x0_8 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc64)
|
| 30 |
+
%x0_9 = arith.remsi %x0, %x0_8 : tensor<32x1xi64> loc(#loc64)
|
| 31 |
+
%x1 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc65)
|
| 32 |
+
%x1_10 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc65)
|
| 33 |
+
%x1_11 = arith.divsi %x1, %x1_10 : tensor<32x1xi64> loc(#loc65)
|
| 34 |
+
%_tmp3 = arith.constant 0 : i64 loc(#loc66)
|
| 35 |
+
%_tmp3_12 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc66)
|
| 36 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc12)
|
| 37 |
+
%c16_i32 = arith.constant 16 : i32 loc(#loc12)
|
| 38 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12)
|
| 39 |
+
%1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12)
|
| 40 |
+
%2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc12)
|
| 41 |
+
%3 = ub.poison : i32 loc(#loc12)
|
| 42 |
+
%_tmp3_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_12) -> (tensor<32x16xi64>) : i32 {
|
| 43 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc68)
|
| 44 |
+
%r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc68)
|
| 45 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc69)
|
| 46 |
+
%r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc69)
|
| 47 |
+
%tmp0 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70)
|
| 48 |
+
%tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc70)
|
| 49 |
+
%tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x16xi64> loc(#loc70)
|
| 50 |
+
%tmp0_23 = tt.broadcast %x0_9 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc71)
|
| 51 |
+
%tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc71)
|
| 52 |
+
%tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<32x16xi64> loc(#loc71)
|
| 53 |
+
%tmp0_26 = arith.muli %ks0, %ks1 : i64 loc(#loc72)
|
| 54 |
+
%tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<32x1xi64> loc(#loc73)
|
| 55 |
+
%tmp0_28 = arith.muli %tmp0_27, %x1_11 : tensor<32x1xi64> loc(#loc73)
|
| 56 |
+
%tmp0_29 = tt.broadcast %tmp0_28 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc74)
|
| 57 |
+
%tmp0_30 = arith.addi %tmp0_25, %tmp0_29 : tensor<32x16xi64> loc(#loc74)
|
| 58 |
+
%tmp0_31 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc75)
|
| 59 |
+
%tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi64> loc(#loc75)
|
| 60 |
+
%tmp0_33 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc76)
|
| 61 |
+
%tmp0_34 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc76)
|
| 62 |
+
%tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<32x16xi1> loc(#loc76)
|
| 63 |
+
%tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc77)
|
| 64 |
+
%tmp0_37 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc77)
|
| 65 |
+
%tmp0_38 = arith.fptosi %tmp0_37 : tensor<32x16xf32> to tensor<32x16xi32> loc(#loc77)
|
| 66 |
+
%tmp0_39 = tt.load %tmp0_32, %tmp0_35, %tmp0_38 evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>> loc(#loc77)
|
| 67 |
+
%tmp1 = arith.extsi %tmp0_39 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc78)
|
| 68 |
+
%tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<32x16xi64> loc(#loc79)
|
| 69 |
+
%_tmp3_40 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc80)
|
| 70 |
+
%_tmp3_41 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc80)
|
| 71 |
+
%_tmp3_42 = arith.andi %_tmp3_40, %_tmp3_41 : tensor<32x16xi1> loc(#loc80)
|
| 72 |
+
%_tmp3_43 = arith.select %_tmp3_42, %tmp4, %_tmp3_18 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc81)
|
| 73 |
+
scf.yield %_tmp3_43 : tensor<32x16xi64> loc(#loc27)
|
| 74 |
+
} loc(#loc67)
|
| 75 |
+
%tmp3 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_13) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc82)
|
| 76 |
+
%tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc83)
|
| 77 |
+
%tmp5 = arith.trunci %tmp3_14 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc84)
|
| 78 |
+
%c1_i32 = arith.constant 1 : i32 loc(#loc31)
|
| 79 |
+
%4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc31)
|
| 80 |
+
%5 = arith.cmpi sge, %4, %ks0 : i64 loc(#loc31)
|
| 81 |
+
%c1_i32_15 = arith.constant 1 : i32 loc(#loc32)
|
| 82 |
+
%c1_i32_16 = arith.constant 1 : i32 loc(#loc32)
|
| 83 |
+
%6 = arith.extui %5 : i1 to i32 loc(#loc32)
|
| 84 |
+
%7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc32)
|
| 85 |
+
%c1_i32_17 = arith.constant 1 : i32 loc(#loc33)
|
| 86 |
+
%8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc33)
|
| 87 |
+
%9 = arith.cmpi sgt, %ks0, %8 : i64 loc(#loc33)
|
| 88 |
+
%10 = arith.extui %9 : i1 to i64 loc(#loc34)
|
| 89 |
+
%11 = arith.muli %ks0, %10 : i64 loc(#loc34)
|
| 90 |
+
%12 = arith.extsi %7 : i32 to i64 loc(#loc35)
|
| 91 |
+
%13 = arith.addi %12, %11 : i64 loc(#loc35)
|
| 92 |
+
%14 = tt.splat %13 : i64 -> tensor<32x1xi64> loc(#loc36)
|
| 93 |
+
%15 = arith.muli %x1_11, %14 : tensor<32x1xi64> loc(#loc36)
|
| 94 |
+
%16 = arith.addi %x0_9, %15 : tensor<32x1xi64> loc(#loc37)
|
| 95 |
+
%17 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>> loc(#loc38)
|
| 96 |
+
%18 = tt.addptr %17, %16 : tensor<32x1x!tt.ptr<i32>>, tensor<32x1xi64> loc(#loc38)
|
| 97 |
+
tt.store %18, %tmp5, %xmask_6 : tensor<32x1x!tt.ptr<i32>> loc(#loc39)
|
| 98 |
+
tt.return loc(#loc40)
|
| 99 |
+
} loc(#loc)
|
| 100 |
+
tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc41))) -> tensor<32xi64> attributes {noinline = false} {
|
| 101 |
+
%0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 102 |
+
^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)):
|
| 103 |
+
%2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc42)
|
| 104 |
+
tt.reduce.return %2 : i64 loc(#loc42)
|
| 105 |
+
}) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc42)
|
| 106 |
+
tt.return %0 : tensor<32xi64> loc(#loc44)
|
| 107 |
+
^bb1: // no predecessors
|
| 108 |
+
%1 = ub.poison : tensor<32xi64> loc(#loc45)
|
| 109 |
+
tt.return %1 : tensor<32xi64> loc(#loc45)
|
| 110 |
+
} loc(#loc41)
|
| 111 |
+
tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc46)), %b: i64 loc("b"(#loc46))) -> i64 attributes {noinline = false} {
|
| 112 |
+
%0 = arith.addi %a, %b : i64 loc(#loc47)
|
| 113 |
+
tt.return %0 : i64 loc(#loc48)
|
| 114 |
+
^bb1: // no predecessors
|
| 115 |
+
%1 = ub.poison : i64 loc(#loc49)
|
| 116 |
+
tt.return %1 : i64 loc(#loc49)
|
| 117 |
+
} loc(#loc46)
|
| 118 |
+
} loc(#loc)
|
| 119 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:28)
|
| 120 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:33)
|
| 121 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:36)
|
| 122 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:44)
|
| 123 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:23)
|
| 124 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":23:21)
|
| 125 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:27)
|
| 126 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:37)
|
| 127 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":26:19)
|
| 128 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":27:19)
|
| 129 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":28:43)
|
| 130 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":30:40)
|
| 131 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":31:31)
|
| 132 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":32:29)
|
| 133 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:43)
|
| 134 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:39)
|
| 135 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:54)
|
| 136 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:58)
|
| 137 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:50)
|
| 138 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:34)
|
| 139 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:73)
|
| 140 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:63)
|
| 141 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":37:23)
|
| 142 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":39:23)
|
| 143 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:35)
|
| 144 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:48)
|
| 145 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:8)
|
| 146 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:25)
|
| 147 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:28)
|
| 148 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":42:19)
|
| 149 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:49)
|
| 150 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:41)
|
| 151 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:75)
|
| 152 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:66)
|
| 153 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:57)
|
| 154 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:34)
|
| 155 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:30)
|
| 156 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:25)
|
| 157 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:88)
|
| 158 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:4)
|
| 159 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 160 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
|
| 161 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
|
| 162 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 163 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
|
| 164 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
|
| 165 |
+
#loc56 = loc("xoffset"(#loc1))
|
| 166 |
+
#loc57 = loc("xoffset"(#loc2))
|
| 167 |
+
#loc58 = loc("xindex"(#loc3))
|
| 168 |
+
#loc59 = loc("xindex"(#loc4))
|
| 169 |
+
#loc60 = loc("xindex"(#loc5))
|
| 170 |
+
#loc61 = loc("xmask"(#loc6))
|
| 171 |
+
#loc62 = loc("r0_base"(#loc7))
|
| 172 |
+
#loc63 = loc("r0_base"(#loc8))
|
| 173 |
+
#loc64 = loc("x0"(#loc9))
|
| 174 |
+
#loc65 = loc("x1"(#loc10))
|
| 175 |
+
#loc66 = loc("_tmp3"(#loc11))
|
| 176 |
+
#loc67 = loc("_tmp3"(#loc12))
|
| 177 |
+
#loc68 = loc("r0_index"(#loc13))
|
| 178 |
+
#loc69 = loc("r0_mask"(#loc14))
|
| 179 |
+
#loc70 = loc("tmp0"(#loc15))
|
| 180 |
+
#loc71 = loc("tmp0"(#loc16))
|
| 181 |
+
#loc72 = loc("tmp0"(#loc17))
|
| 182 |
+
#loc73 = loc("tmp0"(#loc18))
|
| 183 |
+
#loc74 = loc("tmp0"(#loc19))
|
| 184 |
+
#loc75 = loc("tmp0"(#loc20))
|
| 185 |
+
#loc76 = loc("tmp0"(#loc21))
|
| 186 |
+
#loc77 = loc("tmp0"(#loc22))
|
| 187 |
+
#loc78 = loc("tmp1"(#loc23))
|
| 188 |
+
#loc79 = loc("tmp4"(#loc24))
|
| 189 |
+
#loc80 = loc("_tmp3"(#loc25))
|
| 190 |
+
#loc81 = loc("_tmp3"(#loc26))
|
| 191 |
+
#loc82 = loc("tmp3"(#loc28))
|
| 192 |
+
#loc83 = loc("tmp3"(#loc29))
|
| 193 |
+
#loc84 = loc("tmp5"(#loc30))
|
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
|
| 2 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":18:0)
|
| 3 |
+
#loc1 = loc(unknown)
|
| 4 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:25)
|
| 5 |
+
#loc40 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc41 = loc("out_ptr1"(#loc))
|
| 7 |
+
#loc42 = loc("ks0"(#loc))
|
| 8 |
+
#loc43 = loc("ks1"(#loc))
|
| 9 |
+
#loc44 = loc("xnumel"(#loc))
|
| 10 |
+
#loc45 = loc("r0_numel"(#loc))
|
| 11 |
+
#loc68 = loc("tmp3"(#loc26))
|
| 12 |
+
#loc73 = loc(callsite(#loc1 at #loc68))
|
| 13 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 14 |
+
tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 15 |
+
%cst = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1)
|
| 16 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 17 |
+
%c16_i32 = arith.constant 16 : i32 loc(#loc1)
|
| 18 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc1)
|
| 19 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc1)
|
| 20 |
+
%cst_0 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1)
|
| 21 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc46)
|
| 22 |
+
%xoffset_1 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc47)
|
| 23 |
+
%xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48)
|
| 24 |
+
%xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc48)
|
| 25 |
+
%xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<32x1xi32, #blocked> loc(#loc49)
|
| 26 |
+
%xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<32x1xi32, #blocked> loc(#loc49)
|
| 27 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32, #blocked> loc(#loc50)
|
| 28 |
+
%xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<32x1xi32, #blocked> loc(#loc50)
|
| 29 |
+
%r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc51)
|
| 30 |
+
%r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc51)
|
| 31 |
+
%x0 = arith.extsi %xindex_4 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc52)
|
| 32 |
+
%x0_7 = tt.splat %ks0 : i64 -> tensor<32x1xi64, #blocked> loc(#loc52)
|
| 33 |
+
%x0_8 = arith.remsi %x0, %x0_7 : tensor<32x1xi64, #blocked> loc(#loc52)
|
| 34 |
+
%x1 = arith.divsi %x0, %x0_7 : tensor<32x1xi64, #blocked> loc(#loc53)
|
| 35 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc54)
|
| 36 |
+
%tmp0 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc55)
|
| 37 |
+
%tmp0_9 = tt.broadcast %x0_8 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc56)
|
| 38 |
+
%tmp0_10 = arith.muli %ks0, %ks1 : i64 loc(#loc57)
|
| 39 |
+
%tmp0_11 = tt.splat %tmp0_10 : i64 -> tensor<32x1xi64, #blocked> loc(#loc58)
|
| 40 |
+
%tmp0_12 = arith.muli %tmp0_11, %x1 : tensor<32x1xi64, #blocked> loc(#loc58)
|
| 41 |
+
%tmp0_13 = tt.broadcast %tmp0_12 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc59)
|
| 42 |
+
%tmp0_14 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc60)
|
| 43 |
+
%tmp0_15 = tt.broadcast %xmask_5 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc61)
|
| 44 |
+
%_tmp3 = scf.for %_tmp3_17 = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_18 = %cst) -> (tensor<32x16xi64, #blocked>) : i32 {
|
| 45 |
+
%r0_index = tt.splat %_tmp3_17 : i32 -> tensor<1x16xi32, #blocked> loc(#loc63)
|
| 46 |
+
%r0_index_19 = arith.addi %r0_index, %r0_base_6 : tensor<1x16xi32, #blocked> loc(#loc63)
|
| 47 |
+
%r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc54)
|
| 48 |
+
%tmp0_21 = arith.extsi %r0_index_19 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc55)
|
| 49 |
+
%tmp0_22 = arith.muli %tmp0, %tmp0_21 : tensor<1x16xi64, #blocked> loc(#loc55)
|
| 50 |
+
%tmp0_23 = tt.broadcast %tmp0_22 : tensor<1x16xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc56)
|
| 51 |
+
%tmp0_24 = arith.addi %tmp0_9, %tmp0_23 : tensor<32x16xi64, #blocked> loc(#loc56)
|
| 52 |
+
%tmp0_25 = arith.addi %tmp0_24, %tmp0_13 : tensor<32x16xi64, #blocked> loc(#loc59)
|
| 53 |
+
%tmp0_26 = tt.addptr %tmp0_14, %tmp0_25 : tensor<32x16x!tt.ptr<i32>, #blocked>, tensor<32x16xi64, #blocked> loc(#loc60)
|
| 54 |
+
%tmp0_27 = tt.broadcast %r0_mask_20 : tensor<1x16xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc61)
|
| 55 |
+
%tmp0_28 = arith.andi %tmp0_27, %tmp0_15 : tensor<32x16xi1, #blocked> loc(#loc61)
|
| 56 |
+
%tmp0_29 = tt.load %tmp0_26, %tmp0_28, %cst_0 evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc64)
|
| 57 |
+
%tmp1 = arith.extsi %tmp0_29 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc65)
|
| 58 |
+
%tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<32x16xi64, #blocked> loc(#loc66)
|
| 59 |
+
%_tmp3_30 = arith.select %tmp0_28, %tmp4, %_tmp3_18 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc67)
|
| 60 |
+
scf.yield %_tmp3_30 : tensor<32x16xi64, #blocked> loc(#loc24)
|
| 61 |
+
} loc(#loc62)
|
| 62 |
+
%tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({
|
| 63 |
+
^bb0(%tmp3_17: i64 loc(callsite(#loc1 at #loc68)), %tmp3_18: i64 loc(callsite(#loc1 at #loc68))):
|
| 64 |
+
%tmp3_19 = arith.addi %tmp3_17, %tmp3_18 : i64 loc(#loc74)
|
| 65 |
+
tt.reduce.return %tmp3_19 : i64 loc(#loc72)
|
| 66 |
+
}) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72)
|
| 67 |
+
%tmp3_16 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc69)
|
| 68 |
+
%tmp5 = arith.trunci %tmp3_16 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc70)
|
| 69 |
+
%0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc30)
|
| 70 |
+
%1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc31)
|
| 71 |
+
%2 = arith.extui %1 : i1 to i64 loc(#loc32)
|
| 72 |
+
%3 = arith.muli %ks0, %2 : i64 loc(#loc32)
|
| 73 |
+
%4 = arith.extui %0 : i1 to i64 loc(#loc71)
|
| 74 |
+
%5 = arith.addi %4, %3 : i64 loc(#loc33)
|
| 75 |
+
%6 = tt.splat %5 : i64 -> tensor<32x1xi64, #blocked> loc(#loc35)
|
| 76 |
+
%7 = arith.muli %x1, %6 : tensor<32x1xi64, #blocked> loc(#loc35)
|
| 77 |
+
%8 = arith.addi %x0_8, %7 : tensor<32x1xi64, #blocked> loc(#loc36)
|
| 78 |
+
%9 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc37)
|
| 79 |
+
%10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr<i32>, #blocked>, tensor<32x1xi64, #blocked> loc(#loc37)
|
| 80 |
+
tt.store %10, %tmp5, %xmask_5 : tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc38)
|
| 81 |
+
tt.return loc(#loc39)
|
| 82 |
+
} loc(#loc)
|
| 83 |
+
} loc(#loc)
|
| 84 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:28)
|
| 85 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:33)
|
| 86 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:44)
|
| 87 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:23)
|
| 88 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":23:21)
|
| 89 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:37)
|
| 90 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":26:19)
|
| 91 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":27:19)
|
| 92 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":32:29)
|
| 93 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:43)
|
| 94 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:39)
|
| 95 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:54)
|
| 96 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:58)
|
| 97 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:50)
|
| 98 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:34)
|
| 99 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:73)
|
| 100 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":30:40)
|
| 101 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":31:31)
|
| 102 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:63)
|
| 103 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":37:23)
|
| 104 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":39:23)
|
| 105 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:48)
|
| 106 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:8)
|
| 107 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 108 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 109 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:28)
|
| 110 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":42:19)
|
| 111 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:49)
|
| 112 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:75)
|
| 113 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:66)
|
| 114 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:57)
|
| 115 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:41)
|
| 116 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:34)
|
| 117 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:30)
|
| 118 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:25)
|
| 119 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:88)
|
| 120 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:4)
|
| 121 |
+
#loc46 = loc("xoffset"(#loc2))
|
| 122 |
+
#loc47 = loc("xoffset"(#loc3))
|
| 123 |
+
#loc48 = loc("xindex"(#loc4))
|
| 124 |
+
#loc49 = loc("xindex"(#loc5))
|
| 125 |
+
#loc50 = loc("xmask"(#loc6))
|
| 126 |
+
#loc51 = loc("r0_base"(#loc7))
|
| 127 |
+
#loc52 = loc("x0"(#loc8))
|
| 128 |
+
#loc53 = loc("x1"(#loc9))
|
| 129 |
+
#loc54 = loc("r0_mask"(#loc10))
|
| 130 |
+
#loc55 = loc("tmp0"(#loc11))
|
| 131 |
+
#loc56 = loc("tmp0"(#loc12))
|
| 132 |
+
#loc57 = loc("tmp0"(#loc13))
|
| 133 |
+
#loc58 = loc("tmp0"(#loc14))
|
| 134 |
+
#loc59 = loc("tmp0"(#loc15))
|
| 135 |
+
#loc60 = loc("tmp0"(#loc16))
|
| 136 |
+
#loc61 = loc("tmp0"(#loc17))
|
| 137 |
+
#loc62 = loc("_tmp3"(#loc18))
|
| 138 |
+
#loc63 = loc("r0_index"(#loc19))
|
| 139 |
+
#loc64 = loc("tmp0"(#loc20))
|
| 140 |
+
#loc65 = loc("tmp1"(#loc21))
|
| 141 |
+
#loc66 = loc("tmp4"(#loc22))
|
| 142 |
+
#loc67 = loc("_tmp3"(#loc23))
|
| 143 |
+
#loc69 = loc("tmp3"(#loc28))
|
| 144 |
+
#loc70 = loc("tmp5"(#loc29))
|
| 145 |
+
#loc71 = loc(fused[#loc33, #loc34])
|
| 146 |
+
#loc72 = loc(callsite(#loc25 at #loc68))
|
| 147 |
+
#loc74 = loc(callsite(#loc27 at #loc72))
|
SpecForge-ext/cache/compiled_kernels/triton/3/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:25)
|
| 4 |
+
#loc43 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc44 = loc("out_ptr1"(#loc))
|
| 6 |
+
#loc45 = loc("ks0"(#loc))
|
| 7 |
+
#loc46 = loc("ks1"(#loc))
|
| 8 |
+
#loc47 = loc("xnumel"(#loc))
|
| 9 |
+
#loc48 = loc("r0_numel"(#loc))
|
| 10 |
+
#loc74 = loc("tmp3"(#loc29))
|
| 11 |
+
#loc79 = loc(callsite(#loc1 at #loc74))
|
| 12 |
+
module {
|
| 13 |
+
tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 14 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc1)
|
| 15 |
+
%cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1)
|
| 16 |
+
%c16_i32 = arith.constant 16 : i32 loc(#loc2)
|
| 17 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc2)
|
| 18 |
+
%_tmp3 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc49)
|
| 19 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc1)
|
| 20 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc50)
|
| 21 |
+
%xoffset_0 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc51)
|
| 22 |
+
%xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc52)
|
| 23 |
+
%xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc53)
|
| 24 |
+
%xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<32x1xi32> loc(#loc54)
|
| 25 |
+
%xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<32x1xi32> loc(#loc54)
|
| 26 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc55)
|
| 27 |
+
%xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<32x1xi32> loc(#loc55)
|
| 28 |
+
%r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc56)
|
| 29 |
+
%r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc57)
|
| 30 |
+
%x0 = arith.extsi %xindex_3 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc58)
|
| 31 |
+
%x0_6 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc58)
|
| 32 |
+
%x0_7 = arith.remsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc58)
|
| 33 |
+
%x1 = arith.divsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc59)
|
| 34 |
+
%_tmp3_8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_10 = %_tmp3) -> (tensor<32x16xi64>) : i32 {
|
| 35 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc61)
|
| 36 |
+
%r0_index_11 = arith.addi %r0_index, %r0_base_5 : tensor<1x16xi32> loc(#loc61)
|
| 37 |
+
%r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc62)
|
| 38 |
+
%r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x16xi32> loc(#loc62)
|
| 39 |
+
%tmp0 = arith.extsi %r0_index_11 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc63)
|
| 40 |
+
%tmp0_13 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc63)
|
| 41 |
+
%tmp0_14 = arith.muli %tmp0_13, %tmp0 : tensor<1x16xi64> loc(#loc63)
|
| 42 |
+
%tmp0_15 = tt.broadcast %x0_7 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc64)
|
| 43 |
+
%tmp0_16 = tt.broadcast %tmp0_14 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc64)
|
| 44 |
+
%tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<32x16xi64> loc(#loc64)
|
| 45 |
+
%tmp0_18 = arith.muli %ks0, %ks1 : i64 loc(#loc65)
|
| 46 |
+
%tmp0_19 = tt.splat %tmp0_18 : i64 -> tensor<32x1xi64> loc(#loc66)
|
| 47 |
+
%tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<32x1xi64> loc(#loc66)
|
| 48 |
+
%tmp0_21 = tt.broadcast %tmp0_20 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc67)
|
| 49 |
+
%tmp0_22 = arith.addi %tmp0_17, %tmp0_21 : tensor<32x16xi64> loc(#loc67)
|
| 50 |
+
%tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc68)
|
| 51 |
+
%tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi64> loc(#loc68)
|
| 52 |
+
%tmp0_25 = tt.broadcast %r0_mask_12 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc69)
|
| 53 |
+
%tmp0_26 = tt.broadcast %xmask_4 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc69)
|
| 54 |
+
%tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<32x16xi1> loc(#loc69)
|
| 55 |
+
%tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>> loc(#loc70)
|
| 56 |
+
%tmp1 = arith.extsi %tmp0_28 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc71)
|
| 57 |
+
%tmp4 = arith.addi %_tmp3_10, %tmp1 : tensor<32x16xi64> loc(#loc72)
|
| 58 |
+
%_tmp3_29 = arith.select %tmp0_27, %tmp4, %_tmp3_10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc73)
|
| 59 |
+
scf.yield %_tmp3_29 : tensor<32x16xi64> loc(#loc27)
|
| 60 |
+
} loc(#loc60)
|
| 61 |
+
%tmp3 = "tt.reduce"(%_tmp3_8) <{axis = 1 : i32}> ({
|
| 62 |
+
^bb0(%tmp3_10: i64 loc(callsite(#loc1 at #loc74)), %tmp3_11: i64 loc(callsite(#loc1 at #loc74))):
|
| 63 |
+
%tmp3_12 = arith.addi %tmp3_10, %tmp3_11 : i64 loc(#loc80)
|
| 64 |
+
tt.reduce.return %tmp3_12 : i64 loc(#loc78)
|
| 65 |
+
}) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc78)
|
| 66 |
+
%tmp3_9 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc75)
|
| 67 |
+
%tmp5 = arith.trunci %tmp3_9 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc76)
|
| 68 |
+
%0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc33)
|
| 69 |
+
%1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc34)
|
| 70 |
+
%2 = arith.extui %1 : i1 to i64 loc(#loc35)
|
| 71 |
+
%3 = arith.muli %ks0, %2 : i64 loc(#loc35)
|
| 72 |
+
%4 = arith.extui %0 : i1 to i64 loc(#loc77)
|
| 73 |
+
%5 = arith.addi %4, %3 : i64 loc(#loc36)
|
| 74 |
+
%6 = tt.splat %5 : i64 -> tensor<32x1xi64> loc(#loc38)
|
| 75 |
+
%7 = arith.muli %x1, %6 : tensor<32x1xi64> loc(#loc38)
|
| 76 |
+
%8 = arith.addi %x0_7, %7 : tensor<32x1xi64> loc(#loc39)
|
| 77 |
+
%9 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>> loc(#loc40)
|
| 78 |
+
%10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr<i32>>, tensor<32x1xi64> loc(#loc40)
|
| 79 |
+
tt.store %10, %tmp5, %xmask_4 : tensor<32x1x!tt.ptr<i32>> loc(#loc41)
|
| 80 |
+
tt.return loc(#loc42)
|
| 81 |
+
} loc(#loc)
|
| 82 |
+
} loc(#loc)
|
| 83 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":30:40)
|
| 84 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":28:43)
|
| 85 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:28)
|
| 86 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":21:33)
|
| 87 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:36)
|
| 88 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:44)
|
| 89 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":22:23)
|
| 90 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":23:21)
|
| 91 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:27)
|
| 92 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":24:37)
|
| 93 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":26:19)
|
| 94 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":27:19)
|
| 95 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":31:31)
|
| 96 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":32:29)
|
| 97 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:43)
|
| 98 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:39)
|
| 99 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:54)
|
| 100 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:58)
|
| 101 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:50)
|
| 102 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:34)
|
| 103 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:73)
|
| 104 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":36:63)
|
| 105 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":37:23)
|
| 106 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":39:23)
|
| 107 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:48)
|
| 108 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":40:8)
|
| 109 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 110 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 111 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":41:28)
|
| 112 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":42:19)
|
| 113 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:49)
|
| 114 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:75)
|
| 115 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:66)
|
| 116 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:57)
|
| 117 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:41)
|
| 118 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:34)
|
| 119 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:30)
|
| 120 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:25)
|
| 121 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:88)
|
| 122 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xt/cxtmrcn6ndghukfh42cluziweoazfmfrz3jeopcrromwvj5m3lsj.py":43:4)
|
| 123 |
+
#loc49 = loc("_tmp3"(#loc3))
|
| 124 |
+
#loc50 = loc("xoffset"(#loc4))
|
| 125 |
+
#loc51 = loc("xoffset"(#loc5))
|
| 126 |
+
#loc52 = loc("xindex"(#loc6))
|
| 127 |
+
#loc53 = loc("xindex"(#loc7))
|
| 128 |
+
#loc54 = loc("xindex"(#loc8))
|
| 129 |
+
#loc55 = loc("xmask"(#loc9))
|
| 130 |
+
#loc56 = loc("r0_base"(#loc10))
|
| 131 |
+
#loc57 = loc("r0_base"(#loc11))
|
| 132 |
+
#loc58 = loc("x0"(#loc12))
|
| 133 |
+
#loc59 = loc("x1"(#loc13))
|
| 134 |
+
#loc60 = loc("_tmp3"(#loc2))
|
| 135 |
+
#loc61 = loc("r0_index"(#loc14))
|
| 136 |
+
#loc62 = loc("r0_mask"(#loc15))
|
| 137 |
+
#loc63 = loc("tmp0"(#loc16))
|
| 138 |
+
#loc64 = loc("tmp0"(#loc17))
|
| 139 |
+
#loc65 = loc("tmp0"(#loc18))
|
| 140 |
+
#loc66 = loc("tmp0"(#loc19))
|
| 141 |
+
#loc67 = loc("tmp0"(#loc20))
|
| 142 |
+
#loc68 = loc("tmp0"(#loc21))
|
| 143 |
+
#loc69 = loc("tmp0"(#loc22))
|
| 144 |
+
#loc70 = loc("tmp0"(#loc23))
|
| 145 |
+
#loc71 = loc("tmp1"(#loc24))
|
| 146 |
+
#loc72 = loc("tmp4"(#loc25))
|
| 147 |
+
#loc73 = loc("_tmp3"(#loc26))
|
| 148 |
+
#loc75 = loc("tmp3"(#loc31))
|
| 149 |
+
#loc76 = loc("tmp5"(#loc32))
|
| 150 |
+
#loc77 = loc(fused[#loc36, #loc37])
|
| 151 |
+
#loc78 = loc(callsite(#loc28 at #loc74))
|
| 152 |
+
#loc80 = loc(callsite(#loc30 at #loc78))
|
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/__grp__triton_tem_fused_zeros_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_tem_fused_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source", "triton_tem_fused_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir", "triton_tem_fused_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir", "triton_tem_fused_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir", "triton_tem_fused_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx", "triton_tem_fused_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.cubin", "triton_tem_fused_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "d27b4d62c7c8c987ce65c6725faf9fd300a6f1f86eddda35855b227ed6dece60", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_zeros_1"}
|
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin
ADDED
|
Binary file (33 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "db4fe3a24d03312f3dea8e5ee7eb1a7c6d73f10bc6de254a208984a64d7fe777", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1"}
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%15 = icmp slt i32 %14, %9, !dbg !8
|
| 11 |
+
%16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%17 = and i32 %16, 511, !dbg !9
|
| 13 |
+
%18 = zext nneg i32 %14 to i64, !dbg !10
|
| 14 |
+
%.frozen = freeze i64 %3, !dbg !10
|
| 15 |
+
%19 = sdiv i64 %18, %.frozen, !dbg !10
|
| 16 |
+
%20 = srem i64 %19, %4, !dbg !11
|
| 17 |
+
%21 = mul i64 %19, %.frozen, !dbg !12
|
| 18 |
+
%.decomposed = sub i64 %18, %21, !dbg !12
|
| 19 |
+
%22 = sdiv i64 %18, %7, !dbg !13
|
| 20 |
+
%23 = shl nsw i64 %20, 7, !dbg !14
|
| 21 |
+
%24 = shl nuw nsw i64 %.decomposed, 7, !dbg !15
|
| 22 |
+
%25 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16
|
| 23 |
+
%26 = and i32 %16, 127
|
| 24 |
+
%27 = zext nneg i32 %26 to i64
|
| 25 |
+
%28 = or disjoint i64 %24, %27
|
| 26 |
+
%29 = icmp slt i64 %28, %6
|
| 27 |
+
%30 = icmp sge i64 %28, %8
|
| 28 |
+
%31 = tail call i64 @llvm.smin.i64(i64 %8, i64 0)
|
| 29 |
+
%32 = sub nsw i64 %.decomposed, %20
|
| 30 |
+
%33 = shl nsw i64 %32, 7
|
| 31 |
+
%34 = zext nneg i32 %17 to i64, !dbg !17
|
| 32 |
+
%35 = zext nneg i32 %26 to i64, !dbg !17
|
| 33 |
+
%36 = zext nneg i32 %16 to i64, !dbg !17
|
| 34 |
+
%37 = insertelement <4 x i1> poison, i1 %15, i64 0, !dbg !18
|
| 35 |
+
%38 = shufflevector <4 x i1> %37, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !18
|
| 36 |
+
%39 = insertelement <4 x i1> poison, i1 %29, i64 0, !dbg !19
|
| 37 |
+
%40 = shufflevector <4 x i1> %39, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !19
|
| 38 |
+
%41 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !20
|
| 39 |
+
%42 = shufflevector <4 x i64> %41, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !20
|
| 40 |
+
%43 = insertelement <4 x i64> poison, i64 %5, i64 0, !dbg !21
|
| 41 |
+
%44 = shufflevector <4 x i64> %43, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !21
|
| 42 |
+
%45 = insertelement <4 x i64> poison, i64 %28, i64 0, !dbg !22
|
| 43 |
+
%46 = shufflevector <4 x i64> %45, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !22
|
| 44 |
+
%47 = insertelement <4 x i1> poison, i1 %30, i64 0, !dbg !23
|
| 45 |
+
%48 = shufflevector <4 x i1> %47, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !23
|
| 46 |
+
%49 = insertelement <4 x i64> poison, i64 %33, i64 0, !dbg !24
|
| 47 |
+
%50 = shufflevector <4 x i64> %49, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !24
|
| 48 |
+
%51 = insertelement <4 x i64> poison, i64 %8, i64 0, !dbg !25
|
| 49 |
+
%52 = shufflevector <4 x i64> %51, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !25
|
| 50 |
+
br label %53, !dbg !17
|
| 51 |
+
|
| 52 |
+
53: ; preds = %13, %53
|
| 53 |
+
%indvars.iv = phi i64 [ 0, %13 ], [ %indvars.iv.next, %53 ]
|
| 54 |
+
%54 = phi <4 x i64> [ zeroinitializer, %13 ], [ %128, %53 ]
|
| 55 |
+
%55 = or disjoint i64 %indvars.iv, %34, !dbg !26
|
| 56 |
+
%56 = or disjoint i64 %indvars.iv, %36, !dbg !26
|
| 57 |
+
%57 = lshr i64 %55, 7, !dbg !27
|
| 58 |
+
%58 = lshr i64 %56, 7, !dbg !27
|
| 59 |
+
%59 = trunc nuw nsw i64 %58 to i32, !dbg !27
|
| 60 |
+
%60 = or i32 %59, 4, !dbg !27
|
| 61 |
+
%61 = or disjoint i64 %57, 8, !dbg !27
|
| 62 |
+
%62 = or i32 %59, 12, !dbg !27
|
| 63 |
+
%63 = zext nneg i32 %60 to i64, !dbg !20
|
| 64 |
+
%64 = zext nneg i32 %62 to i64, !dbg !20
|
| 65 |
+
%65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
|
| 66 |
+
%66 = sub nsw i64 %35, %57, !dbg !29
|
| 67 |
+
%67 = sub nsw i32 %26, %60, !dbg !29
|
| 68 |
+
%68 = sub nsw i64 %35, %61, !dbg !29
|
| 69 |
+
%69 = sub nsw i32 %26, %62, !dbg !29
|
| 70 |
+
%70 = sext i32 %67 to i64, !dbg !30
|
| 71 |
+
%71 = sext i32 %69 to i64, !dbg !30
|
| 72 |
+
%72 = insertelement <4 x i64> poison, i64 %57, i64 0, !dbg !20
|
| 73 |
+
%73 = insertelement <4 x i64> %72, i64 %63, i64 1, !dbg !20
|
| 74 |
+
%74 = insertelement <4 x i64> %73, i64 %61, i64 2, !dbg !20
|
| 75 |
+
%75 = insertelement <4 x i64> %74, i64 %64, i64 3, !dbg !20
|
| 76 |
+
%76 = or disjoint <4 x i64> %42, %75, !dbg !20
|
| 77 |
+
%77 = icmp slt <4 x i64> %76, %44, !dbg !21
|
| 78 |
+
%78 = and <4 x i1> %40, %77, !dbg !19
|
| 79 |
+
%79 = icmp sge <4 x i64> %76, %46, !dbg !22
|
| 80 |
+
%80 = extractelement <4 x i1> %78, i64 0, !dbg !31
|
| 81 |
+
%81 = and i1 %15, %80, !dbg !31
|
| 82 |
+
%82 = extractelement <4 x i1> %78, i64 1, !dbg !31
|
| 83 |
+
%83 = and i1 %15, %82, !dbg !31
|
| 84 |
+
%84 = extractelement <4 x i1> %78, i64 2, !dbg !31
|
| 85 |
+
%85 = and i1 %15, %84, !dbg !31
|
| 86 |
+
%86 = extractelement <4 x i1> %78, i64 3, !dbg !31
|
| 87 |
+
%87 = and i1 %15, %86, !dbg !31
|
| 88 |
+
%88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %65, i1 %81) #5, !dbg !28
|
| 89 |
+
%89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
|
| 90 |
+
%90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %89, i1 %83) #5, !dbg !28
|
| 91 |
+
%91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
|
| 92 |
+
%92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %91, i1 %85) #5, !dbg !28
|
| 93 |
+
%93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28
|
| 94 |
+
%94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %93, i1 %87) #5, !dbg !28
|
| 95 |
+
%95 = insertelement <4 x i64> poison, i64 %88, i64 0, !dbg !32
|
| 96 |
+
%96 = insertelement <4 x i64> %95, i64 %90, i64 1, !dbg !32
|
| 97 |
+
%97 = insertelement <4 x i64> %96, i64 %92, i64 2, !dbg !32
|
| 98 |
+
%98 = insertelement <4 x i64> %97, i64 %94, i64 3, !dbg !32
|
| 99 |
+
%99 = icmp slt <4 x i64> %46, %98, !dbg !32
|
| 100 |
+
%100 = icmp slt <4 x i64> %76, %98, !dbg !33
|
| 101 |
+
%101 = and <4 x i1> %99, %100, !dbg !34
|
| 102 |
+
%102 = and <4 x i1> %79, %101, !dbg !35
|
| 103 |
+
%103 = srem i64 %28, %8, !dbg !36
|
| 104 |
+
%.not = icmp eq i64 %103, 0, !dbg !37
|
| 105 |
+
%104 = select i1 %.not, i64 0, i64 %31, !dbg !38
|
| 106 |
+
%105 = add nsw i64 %104, %103, !dbg !38
|
| 107 |
+
%106 = insertelement <4 x i64> poison, i64 %105, i64 0, !dbg !39
|
| 108 |
+
%107 = shufflevector <4 x i64> %106, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !39
|
| 109 |
+
%108 = icmp slt <4 x i64> %107, %98, !dbg !39
|
| 110 |
+
%109 = insertelement <4 x i64> poison, i64 %66, i64 0, !dbg !24
|
| 111 |
+
%110 = insertelement <4 x i64> %109, i64 %70, i64 1, !dbg !24
|
| 112 |
+
%111 = insertelement <4 x i64> %110, i64 %68, i64 2, !dbg !24
|
| 113 |
+
%112 = insertelement <4 x i64> %111, i64 %71, i64 3, !dbg !24
|
| 114 |
+
%113 = add nsw <4 x i64> %50, %112, !dbg !24
|
| 115 |
+
%114 = srem <4 x i64> %113, %52, !dbg !25
|
| 116 |
+
%115 = icmp ne <4 x i64> %114, zeroinitializer, !dbg !40
|
| 117 |
+
%116 = xor <4 x i64> %114, %52, !dbg !41
|
| 118 |
+
%117 = icmp slt <4 x i64> %116, zeroinitializer, !dbg !41
|
| 119 |
+
%118 = and <4 x i1> %115, %117, !dbg !42
|
| 120 |
+
%119 = select <4 x i1> %118, <4 x i64> %52, <4 x i64> zeroinitializer, !dbg !43
|
| 121 |
+
%120 = sub <4 x i64> zeroinitializer, %119, !dbg !44
|
| 122 |
+
%121 = icmp eq <4 x i64> %114, %120, !dbg !44
|
| 123 |
+
%122 = and <4 x i1> %108, %121, !dbg !23
|
| 124 |
+
%123 = and <4 x i1> %48, %122, !dbg !23
|
| 125 |
+
%124 = or <4 x i1> %102, %123, !dbg !45
|
| 126 |
+
%125 = select <4 x i1> %38, <4 x i1> %78, <4 x i1> zeroinitializer, !dbg !18
|
| 127 |
+
%126 = select <4 x i1> %125, <4 x i1> %124, <4 x i1> zeroinitializer, !dbg !18
|
| 128 |
+
%127 = zext <4 x i1> %126 to <4 x i64>, !dbg !18
|
| 129 |
+
%128 = add <4 x i64> %54, %127, !dbg !18
|
| 130 |
+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !17
|
| 131 |
+
%129 = icmp samesign ult i64 %indvars.iv, 14336, !dbg !17
|
| 132 |
+
br i1 %129, label %53, label %130, !dbg !17
|
| 133 |
+
|
| 134 |
+
130: ; preds = %53
|
| 135 |
+
%131 = and i32 %16, 31, !dbg !9
|
| 136 |
+
%132 = lshr i32 %16, 5, !dbg !9
|
| 137 |
+
%133 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %128), !dbg !46
|
| 138 |
+
%extelt.offset = lshr i64 %133, 32, !dbg !50
|
| 139 |
+
%134 = trunc nuw i64 %extelt.offset to i32, !dbg !50
|
| 140 |
+
%135 = trunc i64 %133 to i32, !dbg !50
|
| 141 |
+
%136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !50
|
| 142 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 16, i32 31), !dbg !50
|
| 143 |
+
%138 = insertelement <2 x i32> poison, i32 %136, i64 0, !dbg !50
|
| 144 |
+
%139 = insertelement <2 x i32> %138, i32 %137, i64 1, !dbg !50
|
| 145 |
+
%140 = bitcast <2 x i32> %139 to i64, !dbg !50
|
| 146 |
+
%141 = add i64 %133, %140, !dbg !46
|
| 147 |
+
%extelt.offset1 = lshr i64 %141, 32, !dbg !50
|
| 148 |
+
%142 = trunc nuw i64 %extelt.offset1 to i32, !dbg !50
|
| 149 |
+
%143 = trunc i64 %141 to i32, !dbg !50
|
| 150 |
+
%144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 8, i32 31), !dbg !50
|
| 151 |
+
%145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !50
|
| 152 |
+
%146 = insertelement <2 x i32> poison, i32 %144, i64 0, !dbg !50
|
| 153 |
+
%147 = insertelement <2 x i32> %146, i32 %145, i64 1, !dbg !50
|
| 154 |
+
%148 = bitcast <2 x i32> %147 to i64, !dbg !50
|
| 155 |
+
%149 = add i64 %141, %148, !dbg !46
|
| 156 |
+
%extelt.offset2 = lshr i64 %149, 32, !dbg !50
|
| 157 |
+
%150 = trunc nuw i64 %extelt.offset2 to i32, !dbg !50
|
| 158 |
+
%151 = trunc i64 %149 to i32, !dbg !50
|
| 159 |
+
%152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 4, i32 31), !dbg !50
|
| 160 |
+
%153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 4, i32 31), !dbg !50
|
| 161 |
+
%154 = insertelement <2 x i32> poison, i32 %152, i64 0, !dbg !50
|
| 162 |
+
%155 = insertelement <2 x i32> %154, i32 %153, i64 1, !dbg !50
|
| 163 |
+
%156 = bitcast <2 x i32> %155 to i64, !dbg !50
|
| 164 |
+
%157 = add i64 %149, %156, !dbg !46
|
| 165 |
+
%extelt.offset3 = lshr i64 %157, 32, !dbg !50
|
| 166 |
+
%158 = trunc nuw i64 %extelt.offset3 to i32, !dbg !50
|
| 167 |
+
%159 = trunc i64 %157 to i32, !dbg !50
|
| 168 |
+
%160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 2, i32 31), !dbg !50
|
| 169 |
+
%161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 2, i32 31), !dbg !50
|
| 170 |
+
%162 = insertelement <2 x i32> poison, i32 %160, i64 0, !dbg !50
|
| 171 |
+
%163 = insertelement <2 x i32> %162, i32 %161, i64 1, !dbg !50
|
| 172 |
+
%164 = bitcast <2 x i32> %163 to i64, !dbg !50
|
| 173 |
+
%165 = add i64 %157, %164, !dbg !46
|
| 174 |
+
%extelt.offset4 = lshr i64 %165, 32, !dbg !50
|
| 175 |
+
%166 = trunc nuw i64 %extelt.offset4 to i32, !dbg !50
|
| 176 |
+
%167 = trunc i64 %165 to i32, !dbg !50
|
| 177 |
+
%168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 1, i32 31), !dbg !50
|
| 178 |
+
%169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 1, i32 31), !dbg !50
|
| 179 |
+
%170 = insertelement <2 x i32> poison, i32 %168, i64 0, !dbg !50
|
| 180 |
+
%171 = insertelement <2 x i32> %170, i32 %169, i64 1, !dbg !50
|
| 181 |
+
%172 = bitcast <2 x i32> %171 to i64, !dbg !50
|
| 182 |
+
%173 = add i64 %165, %172, !dbg !46
|
| 183 |
+
%174 = and i32 %132, 15, !dbg !50
|
| 184 |
+
%175 = icmp eq i32 %131, 0, !dbg !50
|
| 185 |
+
%176 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %174, !dbg !50
|
| 186 |
+
%177 = insertelement <1 x i64> poison, i64 %173, i64 0, !dbg !50
|
| 187 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %176, <1 x i64> %177, i1 %175) #5, !dbg !50
|
| 188 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50
|
| 189 |
+
%178 = icmp samesign ult i32 %16, 16, !dbg !50
|
| 190 |
+
%179 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %16, !dbg !50
|
| 191 |
+
%180 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %179, i1 %178) #5, !dbg !50
|
| 192 |
+
%extelt.offset5 = lshr i64 %180, 32, !dbg !50
|
| 193 |
+
%181 = trunc nuw i64 %extelt.offset5 to i32, !dbg !50
|
| 194 |
+
%182 = trunc i64 %180 to i32, !dbg !50
|
| 195 |
+
%183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 8, i32 31), !dbg !50
|
| 196 |
+
%184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 8, i32 31), !dbg !50
|
| 197 |
+
%185 = insertelement <2 x i32> poison, i32 %183, i64 0, !dbg !50
|
| 198 |
+
%186 = insertelement <2 x i32> %185, i32 %184, i64 1, !dbg !50
|
| 199 |
+
%187 = bitcast <2 x i32> %186 to i64, !dbg !50
|
| 200 |
+
%188 = add i64 %180, %187, !dbg !46
|
| 201 |
+
%extelt.offset6 = lshr i64 %188, 32, !dbg !50
|
| 202 |
+
%189 = trunc nuw i64 %extelt.offset6 to i32, !dbg !50
|
| 203 |
+
%190 = trunc i64 %188 to i32, !dbg !50
|
| 204 |
+
%191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 4, i32 31), !dbg !50
|
| 205 |
+
%192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 4, i32 31), !dbg !50
|
| 206 |
+
%193 = insertelement <2 x i32> poison, i32 %191, i64 0, !dbg !50
|
| 207 |
+
%194 = insertelement <2 x i32> %193, i32 %192, i64 1, !dbg !50
|
| 208 |
+
%195 = bitcast <2 x i32> %194 to i64, !dbg !50
|
| 209 |
+
%196 = add i64 %188, %195, !dbg !46
|
| 210 |
+
%extelt.offset7 = lshr i64 %196, 32, !dbg !50
|
| 211 |
+
%197 = trunc nuw i64 %extelt.offset7 to i32, !dbg !50
|
| 212 |
+
%198 = trunc i64 %196 to i32, !dbg !50
|
| 213 |
+
%199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !50
|
| 214 |
+
%200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 2, i32 31), !dbg !50
|
| 215 |
+
%201 = insertelement <2 x i32> poison, i32 %199, i64 0, !dbg !50
|
| 216 |
+
%202 = insertelement <2 x i32> %201, i32 %200, i64 1, !dbg !50
|
| 217 |
+
%203 = bitcast <2 x i32> %202 to i64, !dbg !50
|
| 218 |
+
%204 = add i64 %196, %203, !dbg !46
|
| 219 |
+
%extelt.offset8 = lshr i64 %204, 32, !dbg !50
|
| 220 |
+
%205 = trunc nuw i64 %extelt.offset8 to i32, !dbg !50
|
| 221 |
+
%206 = trunc i64 %204 to i32, !dbg !50
|
| 222 |
+
%207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 1, i32 31), !dbg !50
|
| 223 |
+
%208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !50
|
| 224 |
+
%209 = insertelement <2 x i32> poison, i32 %207, i64 0, !dbg !50
|
| 225 |
+
%210 = insertelement <2 x i32> %209, i32 %208, i64 1, !dbg !50
|
| 226 |
+
%211 = bitcast <2 x i32> %210 to i64, !dbg !50
|
| 227 |
+
%212 = add i64 %204, %211, !dbg !46
|
| 228 |
+
%213 = icmp eq i32 %16, 0, !dbg !50
|
| 229 |
+
%214 = insertelement <1 x i64> poison, i64 %212, i64 0, !dbg !50
|
| 230 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %179, <1 x i64> %214, i1 %213) #5, !dbg !50
|
| 231 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50
|
| 232 |
+
%215 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !50
|
| 233 |
+
%216 = add i64 %215, -1, !dbg !51
|
| 234 |
+
%217 = icmp ult i64 %216, 16383, !dbg !51
|
| 235 |
+
%218 = zext i1 %217 to i32, !dbg !52
|
| 236 |
+
%219 = icmp eq i64 %215, 16384, !dbg !53
|
| 237 |
+
%220 = zext i1 %219 to i32, !dbg !52
|
| 238 |
+
%221 = getelementptr i32, ptr addrspace(1) %1, i64 %18, !dbg !54
|
| 239 |
+
%222 = icmp eq i32 %17, 0, !dbg !55
|
| 240 |
+
%223 = and i1 %222, %15, !dbg !55
|
| 241 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %218, ptr addrspace(1) %221, i1 %223) #5, !dbg !55
|
| 242 |
+
%224 = getelementptr i32, ptr addrspace(1) %2, i64 %18, !dbg !56
|
| 243 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %220, ptr addrspace(1) %224, i1 %223) #5, !dbg !57
|
| 244 |
+
ret void, !dbg !58
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 248 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 249 |
+
|
| 250 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 251 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 252 |
+
|
| 253 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 254 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
| 255 |
+
|
| 256 |
+
; Function Attrs: convergent nocallback nounwind
|
| 257 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
|
| 258 |
+
|
| 259 |
+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 260 |
+
declare i64 @llvm.smin.i64(i64, i64) #4
|
| 261 |
+
|
| 262 |
+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 263 |
+
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4
|
| 264 |
+
|
| 265 |
+
attributes #0 = { nounwind "nvvm.reqntid"="512" }
|
| 266 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 267 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 268 |
+
attributes #3 = { convergent nocallback nounwind }
|
| 269 |
+
attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 270 |
+
attributes #5 = { nounwind }
|
| 271 |
+
|
| 272 |
+
!llvm.dbg.cu = !{!0}
|
| 273 |
+
!llvm.module.flags = !{!2, !3}
|
| 274 |
+
|
| 275 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 276 |
+
!1 = !DIFile(filename: "cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av")
|
| 277 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 278 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 279 |
+
!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 280 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 281 |
+
!6 = !{}
|
| 282 |
+
!7 = !DILocation(line: 22, column: 28, scope: !4)
|
| 283 |
+
!8 = !DILocation(line: 24, column: 21, scope: !4)
|
| 284 |
+
!9 = !DILocation(line: 25, column: 37, scope: !4)
|
| 285 |
+
!10 = !DILocation(line: 27, column: 21, scope: !4)
|
| 286 |
+
!11 = !DILocation(line: 27, column: 28, scope: !4)
|
| 287 |
+
!12 = !DILocation(line: 28, column: 19, scope: !4)
|
| 288 |
+
!13 = !DILocation(line: 29, column: 19, scope: !4)
|
| 289 |
+
!14 = !DILocation(line: 39, column: 26, scope: !4)
|
| 290 |
+
!15 = !DILocation(line: 42, column: 26, scope: !4)
|
| 291 |
+
!16 = !DILocation(line: 49, column: 35, scope: !4)
|
| 292 |
+
!17 = !DILocation(line: 32, column: 40, scope: !4)
|
| 293 |
+
!18 = !DILocation(line: 86, column: 50, scope: !4)
|
| 294 |
+
!19 = !DILocation(line: 45, column: 22, scope: !4)
|
| 295 |
+
!20 = !DILocation(line: 39, column: 22, scope: !4)
|
| 296 |
+
!21 = !DILocation(line: 41, column: 22, scope: !4)
|
| 297 |
+
!22 = !DILocation(line: 48, column: 23, scope: !4)
|
| 298 |
+
!23 = !DILocation(line: 79, column: 24, scope: !4)
|
| 299 |
+
!24 = !DILocation(line: 69, column: 51, scope: !4)
|
| 300 |
+
!25 = !DILocation(line: 70, column: 25, scope: !4)
|
| 301 |
+
!26 = !DILocation(line: 33, column: 31, scope: !4)
|
| 302 |
+
!27 = !DILocation(line: 37, column: 27, scope: !4)
|
| 303 |
+
!28 = !DILocation(line: 49, column: 77, scope: !4)
|
| 304 |
+
!29 = !DILocation(line: 69, column: 24, scope: !4)
|
| 305 |
+
!30 = !DILocation(line: 69, column: 38, scope: !4)
|
| 306 |
+
!31 = !DILocation(line: 49, column: 94, scope: !4)
|
| 307 |
+
!32 = !DILocation(line: 50, column: 23, scope: !4)
|
| 308 |
+
!33 = !DILocation(line: 51, column: 23, scope: !4)
|
| 309 |
+
!34 = !DILocation(line: 52, column: 24, scope: !4)
|
| 310 |
+
!35 = !DILocation(line: 53, column: 23, scope: !4)
|
| 311 |
+
!36 = !DILocation(line: 58, column: 24, scope: !4)
|
| 312 |
+
!37 = !DILocation(line: 60, column: 25, scope: !4)
|
| 313 |
+
!38 = !DILocation(line: 66, column: 39, scope: !4)
|
| 314 |
+
!39 = !DILocation(line: 67, column: 24, scope: !4)
|
| 315 |
+
!40 = !DILocation(line: 71, column: 25, scope: !4)
|
| 316 |
+
!41 = !DILocation(line: 73, column: 25, scope: !4)
|
| 317 |
+
!42 = !DILocation(line: 74, column: 24, scope: !4)
|
| 318 |
+
!43 = !DILocation(line: 76, column: 39, scope: !4)
|
| 319 |
+
!44 = !DILocation(line: 78, column: 25, scope: !4)
|
| 320 |
+
!45 = !DILocation(line: 80, column: 24, scope: !4)
|
| 321 |
+
!46 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !49)
|
| 322 |
+
!47 = distinct !DILexicalBlockFile(scope: !4, file: !48, discriminator: 0)
|
| 323 |
+
!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
|
| 324 |
+
!49 = !DILocation(line: 87, column: 27, scope: !4)
|
| 325 |
+
!50 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !49)
|
| 326 |
+
!51 = !DILocation(line: 92, column: 20, scope: !4)
|
| 327 |
+
!52 = !DILocation(line: 0, scope: !4)
|
| 328 |
+
!53 = !DILocation(line: 95, column: 21, scope: !4)
|
| 329 |
+
!54 = !DILocation(line: 98, column: 25, scope: !4)
|
| 330 |
+
!55 = !DILocation(line: 98, column: 37, scope: !4)
|
| 331 |
+
!56 = !DILocation(line: 99, column: 25, scope: !4)
|
| 332 |
+
!57 = !DILocation(line: 99, column: 37, scope: !4)
|
| 333 |
+
!58 = !DILocation(line: 99, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx
ADDED
|
@@ -0,0 +1,809 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
// @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1
|
| 12 |
+
.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1,
|
| 15 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2,
|
| 16 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3,
|
| 17 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4,
|
| 18 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5,
|
| 19 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6,
|
| 20 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7,
|
| 21 |
+
.param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8,
|
| 22 |
+
.param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9,
|
| 23 |
+
.param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_10,
|
| 24 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_11,
|
| 25 |
+
.param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_12
|
| 26 |
+
)
|
| 27 |
+
.reqntid 512
|
| 28 |
+
{
|
| 29 |
+
.reg .pred %p<89>;
|
| 30 |
+
.reg .b32 %r<77>;
|
| 31 |
+
.reg .b64 %rd<216>;
|
| 32 |
+
.loc 1 18 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:18:0
|
| 33 |
+
$L__func_begin0:
|
| 34 |
+
.loc 1 18 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:18:0
|
| 35 |
+
|
| 36 |
+
// %bb.0:
|
| 37 |
+
ld.param.b64 %rd65, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4];
|
| 38 |
+
$L__tmp0:
|
| 39 |
+
.loc 1 22 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:22:28
|
| 40 |
+
mov.u32 %r8, %ctaid.x;
|
| 41 |
+
ld.param.b64 %rd70, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3];
|
| 42 |
+
.loc 1 27 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:27:21
|
| 43 |
+
cvt.u64.u32 %rd1, %r8;
|
| 44 |
+
and.b64 %rd71, %rd70, -4294967296;
|
| 45 |
+
setp.ne.b64 %p21, %rd71, 0;
|
| 46 |
+
cvt.u32.u64 %r75, %rd1;
|
| 47 |
+
@%p21 bra $L__BB0_2;
|
| 48 |
+
bra.uni $L__BB0_1;
|
| 49 |
+
$L__BB0_2:
|
| 50 |
+
div.s64 %rd205, %rd1, %rd70;
|
| 51 |
+
bra.uni $L__BB0_3;
|
| 52 |
+
$L__BB0_1:
|
| 53 |
+
cvt.u32.u64 %r9, %rd70;
|
| 54 |
+
div.u32 %r11, %r75, %r9;
|
| 55 |
+
cvt.u64.u32 %rd205, %r11;
|
| 56 |
+
$L__BB0_3:
|
| 57 |
+
.loc 1 0 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:21
|
| 58 |
+
mov.u32 %r1, %tid.x;
|
| 59 |
+
ld.param.b64 %rd68, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7];
|
| 60 |
+
.loc 1 27 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:27:28
|
| 61 |
+
or.b64 %rd72, %rd205, %rd65;
|
| 62 |
+
and.b64 %rd73, %rd72, -4294967296;
|
| 63 |
+
setp.ne.b64 %p22, %rd73, 0;
|
| 64 |
+
@%p22 bra $L__BB0_5;
|
| 65 |
+
bra.uni $L__BB0_4;
|
| 66 |
+
$L__BB0_5:
|
| 67 |
+
rem.s64 %rd206, %rd205, %rd65;
|
| 68 |
+
bra.uni $L__BB0_6;
|
| 69 |
+
$L__BB0_4:
|
| 70 |
+
cvt.u32.u64 %r12, %rd65;
|
| 71 |
+
cvt.u32.u64 %r13, %rd205;
|
| 72 |
+
rem.u32 %r14, %r13, %r12;
|
| 73 |
+
cvt.u64.u32 %rd206, %r14;
|
| 74 |
+
$L__BB0_6:
|
| 75 |
+
.loc 1 0 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:28
|
| 76 |
+
ld.param.b32 %r7, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9];
|
| 77 |
+
ld.param.b64 %rd69, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8];
|
| 78 |
+
ld.param.b64 %rd67, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6];
|
| 79 |
+
ld.param.b64 %rd62, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0];
|
| 80 |
+
and.b32 %r2, %r1, 511;
|
| 81 |
+
.loc 1 28 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:28:19
|
| 82 |
+
mul.lo.s64 %rd74, %rd205, %rd70;
|
| 83 |
+
sub.s64 %rd9, %rd1, %rd74;
|
| 84 |
+
.loc 1 29 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:29:19
|
| 85 |
+
and.b64 %rd75, %rd68, -4294967296;
|
| 86 |
+
setp.ne.b64 %p23, %rd75, 0;
|
| 87 |
+
@%p23 bra $L__BB0_8;
|
| 88 |
+
bra.uni $L__BB0_7;
|
| 89 |
+
$L__BB0_8:
|
| 90 |
+
div.s64 %rd207, %rd1, %rd68;
|
| 91 |
+
bra.uni $L__BB0_9;
|
| 92 |
+
$L__BB0_7:
|
| 93 |
+
cvt.u32.u64 %r15, %rd68;
|
| 94 |
+
div.u32 %r17, %r75, %r15;
|
| 95 |
+
cvt.u64.u32 %rd207, %r17;
|
| 96 |
+
$L__BB0_9:
|
| 97 |
+
.loc 1 0 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:19
|
| 98 |
+
ld.param.b64 %rd66, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5];
|
| 99 |
+
ld.param.b64 %rd64, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2];
|
| 100 |
+
ld.param.b64 %rd63, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1];
|
| 101 |
+
.loc 1 24 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:24:21
|
| 102 |
+
setp.lt.s32 %p1, %r75, %r7;
|
| 103 |
+
.loc 1 39 26 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:26
|
| 104 |
+
shl.b64 %rd16, %rd206, 7;
|
| 105 |
+
.loc 1 42 26 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:42:26
|
| 106 |
+
shl.b64 %rd81, %rd9, 7;
|
| 107 |
+
.loc 1 49 35 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:35
|
| 108 |
+
shl.b64 %rd82, %rd207, 3;
|
| 109 |
+
add.s64 %rd90, %rd62, %rd82;
|
| 110 |
+
and.b32 %r3, %r1, 127;
|
| 111 |
+
cvt.u64.u32 %rd83, %r3;
|
| 112 |
+
or.b64 %rd24, %rd81, %rd83;
|
| 113 |
+
setp.lt.s64 %p5, %rd24, %rd67;
|
| 114 |
+
setp.ge.s64 %p9, %rd24, %rd69;
|
| 115 |
+
min.s64 %rd15, %rd69, 0;
|
| 116 |
+
sub.s64 %rd84, %rd9, %rd206;
|
| 117 |
+
shl.b64 %rd28, %rd84, 7;
|
| 118 |
+
.loc 1 32 40 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:32:40
|
| 119 |
+
cvt.u64.u32 %rd85, %r2;
|
| 120 |
+
cvt.u64.u32 %rd86, %r1;
|
| 121 |
+
shr.u64 %rd87, %rd86, 7;
|
| 122 |
+
cvt.u32.u64 %r76, %rd87;
|
| 123 |
+
shr.u64 %rd209, %rd85, 7;
|
| 124 |
+
sub.s64 %rd208, %rd83, %rd209;
|
| 125 |
+
mov.b64 %rd211, 0;
|
| 126 |
+
mov.b64 %rd210, -2048;
|
| 127 |
+
mov.b64 %rd212, %rd211;
|
| 128 |
+
mov.b64 %rd213, %rd211;
|
| 129 |
+
mov.b64 %rd214, %rd211;
|
| 130 |
+
bra.uni $L__BB0_10;
|
| 131 |
+
$L__BB0_12: // in Loop: Header=BB0_10 Depth=1
|
| 132 |
+
.loc 1 58 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:58:24
|
| 133 |
+
rem.s64 %rd215, %rd24, %rd69;
|
| 134 |
+
$L__BB0_13: // in Loop: Header=BB0_10 Depth=1
|
| 135 |
+
.loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0
|
| 136 |
+
sub.s32 %r22, %r3, %r20;
|
| 137 |
+
add.s64 %rd45, %rd208, -8;
|
| 138 |
+
sub.s32 %r23, %r3, %r21;
|
| 139 |
+
cvt.s64.s32 %rd46, %r22;
|
| 140 |
+
cvt.s64.s32 %rd47, %r23;
|
| 141 |
+
.loc 1 60 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:60:25
|
| 142 |
+
setp.eq.b64 %p42, %rd215, 0;
|
| 143 |
+
.loc 1 66 39 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:66:39
|
| 144 |
+
selp.b64 %rd117, 0, %rd15, %p42;
|
| 145 |
+
add.s64 %rd118, %rd117, %rd215;
|
| 146 |
+
.loc 1 67 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:67:24
|
| 147 |
+
setp.lt.s64 %p43, %rd118, %rd89;
|
| 148 |
+
setp.lt.s64 %p44, %rd118, %rd93;
|
| 149 |
+
setp.lt.s64 %p45, %rd118, %rd97;
|
| 150 |
+
setp.lt.s64 %p46, %rd118, %rd101;
|
| 151 |
+
.loc 1 69 51 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:69:51
|
| 152 |
+
add.s64 %rd119, %rd28, %rd47;
|
| 153 |
+
add.s64 %rd120, %rd28, %rd45;
|
| 154 |
+
add.s64 %rd121, %rd28, %rd46;
|
| 155 |
+
add.s64 %rd122, %rd28, %rd208;
|
| 156 |
+
.loc 1 70 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:70:25
|
| 157 |
+
rem.s64 %rd123, %rd122, %rd69;
|
| 158 |
+
rem.s64 %rd124, %rd121, %rd69;
|
| 159 |
+
rem.s64 %rd125, %rd120, %rd69;
|
| 160 |
+
rem.s64 %rd126, %rd119, %rd69;
|
| 161 |
+
.loc 1 71 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:71:25
|
| 162 |
+
setp.ne.b64 %p47, %rd126, 0;
|
| 163 |
+
setp.ne.b64 %p48, %rd125, 0;
|
| 164 |
+
setp.ne.b64 %p49, %rd124, 0;
|
| 165 |
+
setp.ne.b64 %p50, %rd123, 0;
|
| 166 |
+
.loc 1 73 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:73:25
|
| 167 |
+
xor.b64 %rd127, %rd126, %rd69;
|
| 168 |
+
xor.b64 %rd128, %rd125, %rd69;
|
| 169 |
+
xor.b64 %rd129, %rd124, %rd69;
|
| 170 |
+
xor.b64 %rd130, %rd123, %rd69;
|
| 171 |
+
.loc 1 76 39 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:76:39
|
| 172 |
+
shr.s64 %rd131, %rd130, 63;
|
| 173 |
+
and.b64 %rd132, %rd131, %rd69;
|
| 174 |
+
selp.b64 %rd133, %rd132, 0, %p50;
|
| 175 |
+
shr.s64 %rd134, %rd129, 63;
|
| 176 |
+
and.b64 %rd135, %rd134, %rd69;
|
| 177 |
+
selp.b64 %rd136, %rd135, 0, %p49;
|
| 178 |
+
shr.s64 %rd137, %rd128, 63;
|
| 179 |
+
and.b64 %rd138, %rd137, %rd69;
|
| 180 |
+
selp.b64 %rd139, %rd138, 0, %p48;
|
| 181 |
+
shr.s64 %rd140, %rd127, 63;
|
| 182 |
+
and.b64 %rd141, %rd140, %rd69;
|
| 183 |
+
selp.b64 %rd142, %rd141, 0, %p47;
|
| 184 |
+
.loc 1 78 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:78:25
|
| 185 |
+
neg.s64 %rd143, %rd142;
|
| 186 |
+
neg.s64 %rd144, %rd139;
|
| 187 |
+
neg.s64 %rd145, %rd136;
|
| 188 |
+
neg.s64 %rd146, %rd133;
|
| 189 |
+
setp.eq.b64 %p51, %rd123, %rd146;
|
| 190 |
+
setp.eq.b64 %p52, %rd124, %rd145;
|
| 191 |
+
setp.eq.b64 %p53, %rd125, %rd144;
|
| 192 |
+
setp.eq.b64 %p54, %rd126, %rd143;
|
| 193 |
+
.loc 1 79 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:79:24
|
| 194 |
+
and.pred %p55, %p46, %p54;
|
| 195 |
+
and.pred %p57, %p45, %p53;
|
| 196 |
+
and.pred %p59, %p44, %p52;
|
| 197 |
+
and.pred %p61, %p43, %p51;
|
| 198 |
+
and.pred %p63, %p9, %p61;
|
| 199 |
+
and.pred %p64, %p9, %p59;
|
| 200 |
+
and.pred %p65, %p9, %p57;
|
| 201 |
+
and.pred %p66, %p9, %p55;
|
| 202 |
+
.loc 1 80 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:80:24
|
| 203 |
+
or.pred %p67, %p20, %p66;
|
| 204 |
+
or.pred %p68, %p19, %p65;
|
| 205 |
+
or.pred %p69, %p18, %p64;
|
| 206 |
+
or.pred %p70, %p17, %p63;
|
| 207 |
+
.loc 1 86 50 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:86:50
|
| 208 |
+
and.pred %p75, %p24, %p70;
|
| 209 |
+
and.pred %p76, %p25, %p69;
|
| 210 |
+
and.pred %p77, %p26, %p68;
|
| 211 |
+
and.pred %p78, %p27, %p67;
|
| 212 |
+
selp.b64 %rd147, 1, 0, %p78;
|
| 213 |
+
selp.b64 %rd148, 1, 0, %p77;
|
| 214 |
+
selp.b64 %rd149, 1, 0, %p76;
|
| 215 |
+
selp.b64 %rd150, 1, 0, %p75;
|
| 216 |
+
add.s64 %rd211, %rd211, %rd150;
|
| 217 |
+
add.s64 %rd212, %rd212, %rd149;
|
| 218 |
+
add.s64 %rd213, %rd213, %rd148;
|
| 219 |
+
add.s64 %rd214, %rd214, %rd147;
|
| 220 |
+
.loc 1 32 40 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:32:40
|
| 221 |
+
add.s64 %rd210, %rd210, 2048;
|
| 222 |
+
add.s32 %r76, %r76, 16;
|
| 223 |
+
add.s64 %rd209, %rd209, 16;
|
| 224 |
+
add.s64 %rd208, %rd208, -16;
|
| 225 |
+
setp.lt.u64 %p79, %rd210, 14336;
|
| 226 |
+
@%p79 bra $L__BB0_10;
|
| 227 |
+
bra.uni $L__BB0_14;
|
| 228 |
+
$L__BB0_10: // =>This Inner Loop Header: Depth=1
|
| 229 |
+
.loc 1 37 27 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:37:27
|
| 230 |
+
or.b32 %r20, %r76, 4;
|
| 231 |
+
add.s64 %rd104, %rd209, %rd16;
|
| 232 |
+
or.b32 %r21, %r76, 12;
|
| 233 |
+
.loc 1 39 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:22
|
| 234 |
+
cvt.u64.u32 %rd105, %r20;
|
| 235 |
+
cvt.u64.u32 %rd106, %r21;
|
| 236 |
+
.loc 1 49 77 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:77
|
| 237 |
+
// begin inline asm
|
| 238 |
+
mov.u64 %rd88, 0x0;
|
| 239 |
+
createpolicy.fractional.L2::evict_last.b64 %rd88, 1.0;
|
| 240 |
+
// end inline asm
|
| 241 |
+
.loc 1 39 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:22
|
| 242 |
+
or.b64 %rd107, %rd16, %rd106;
|
| 243 |
+
add.s64 %rd108, %rd104, 8;
|
| 244 |
+
or.b64 %rd109, %rd16, %rd105;
|
| 245 |
+
or.b64 %rd110, %rd16, %rd209;
|
| 246 |
+
.loc 1 41 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:41:22
|
| 247 |
+
setp.lt.s64 %p29, %rd110, %rd66;
|
| 248 |
+
setp.lt.s64 %p30, %rd109, %rd66;
|
| 249 |
+
setp.lt.s64 %p31, %rd108, %rd66;
|
| 250 |
+
setp.lt.s64 %p32, %rd107, %rd66;
|
| 251 |
+
.loc 1 45 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:45:22
|
| 252 |
+
and.pred %p16, %p5, %p32;
|
| 253 |
+
and.pred %p15, %p5, %p31;
|
| 254 |
+
and.pred %p14, %p5, %p30;
|
| 255 |
+
and.pred %p13, %p5, %p29;
|
| 256 |
+
.loc 1 48 23 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:48:23
|
| 257 |
+
setp.ge.s64 %p33, %rd107, %rd24;
|
| 258 |
+
setp.ge.s64 %p34, %rd108, %rd24;
|
| 259 |
+
setp.ge.s64 %p35, %rd109, %rd24;
|
| 260 |
+
setp.ge.s64 %p36, %rd110, %rd24;
|
| 261 |
+
.loc 1 49 94 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:94
|
| 262 |
+
and.pred %p24, %p1, %p13;
|
| 263 |
+
and.pred %p25, %p1, %p14;
|
| 264 |
+
and.pred %p26, %p1, %p15;
|
| 265 |
+
and.pred %p27, %p1, %p16;
|
| 266 |
+
.loc 1 49 77 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:77
|
| 267 |
+
// begin inline asm
|
| 268 |
+
mov.u64 %rd89, 0x0;
|
| 269 |
+
@%p24 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd89 }, [ %rd90 + 0 ], %rd88;
|
| 270 |
+
// end inline asm
|
| 271 |
+
// begin inline asm
|
| 272 |
+
mov.u64 %rd92, 0x0;
|
| 273 |
+
createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0;
|
| 274 |
+
// end inline asm
|
| 275 |
+
// begin inline asm
|
| 276 |
+
mov.u64 %rd93, 0x0;
|
| 277 |
+
@%p25 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd93 }, [ %rd90 + 0 ], %rd92;
|
| 278 |
+
// end inline asm
|
| 279 |
+
// begin inline asm
|
| 280 |
+
mov.u64 %rd96, 0x0;
|
| 281 |
+
createpolicy.fractional.L2::evict_last.b64 %rd96, 1.0;
|
| 282 |
+
// end inline asm
|
| 283 |
+
// begin inline asm
|
| 284 |
+
mov.u64 %rd97, 0x0;
|
| 285 |
+
@%p26 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd97 }, [ %rd90 + 0 ], %rd96;
|
| 286 |
+
// end inline asm
|
| 287 |
+
// begin inline asm
|
| 288 |
+
mov.u64 %rd100, 0x0;
|
| 289 |
+
createpolicy.fractional.L2::evict_last.b64 %rd100, 1.0;
|
| 290 |
+
// end inline asm
|
| 291 |
+
// begin inline asm
|
| 292 |
+
mov.u64 %rd101, 0x0;
|
| 293 |
+
@%p27 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd101 }, [ %rd90 + 0 ], %rd100;
|
| 294 |
+
// end inline asm
|
| 295 |
+
.loc 1 52 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:52:24
|
| 296 |
+
max.s64 %rd111, %rd24, %rd107;
|
| 297 |
+
setp.lt.s64 %p37, %rd111, %rd101;
|
| 298 |
+
max.s64 %rd112, %rd24, %rd108;
|
| 299 |
+
setp.lt.s64 %p38, %rd112, %rd97;
|
| 300 |
+
max.s64 %rd113, %rd24, %rd109;
|
| 301 |
+
setp.lt.s64 %p39, %rd113, %rd93;
|
| 302 |
+
max.s64 %rd114, %rd24, %rd110;
|
| 303 |
+
setp.lt.s64 %p40, %rd114, %rd89;
|
| 304 |
+
.loc 1 53 23 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:53:23
|
| 305 |
+
and.pred %p17, %p36, %p40;
|
| 306 |
+
and.pred %p18, %p35, %p39;
|
| 307 |
+
and.pred %p19, %p34, %p38;
|
| 308 |
+
and.pred %p20, %p33, %p37;
|
| 309 |
+
.loc 1 58 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:58:24
|
| 310 |
+
or.b64 %rd115, %rd24, %rd69;
|
| 311 |
+
and.b64 %rd116, %rd115, -4294967296;
|
| 312 |
+
setp.ne.b64 %p41, %rd116, 0;
|
| 313 |
+
@%p41 bra $L__BB0_12;
|
| 314 |
+
// %bb.11: // in Loop: Header=BB0_10 Depth=1
|
| 315 |
+
cvt.u32.u64 %r24, %rd69;
|
| 316 |
+
cvt.u32.u64 %r25, %rd24;
|
| 317 |
+
rem.u32 %r26, %r25, %r24;
|
| 318 |
+
cvt.u64.u32 %rd215, %r26;
|
| 319 |
+
bra.uni $L__BB0_13;
|
| 320 |
+
$L__BB0_14:
|
| 321 |
+
.loc 1 25 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:25:37
|
| 322 |
+
and.b32 %r33, %r1, 31;
|
| 323 |
+
$L__tmp1:
|
| 324 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 325 |
+
add.s64 %rd156, %rd211, %rd213;
|
| 326 |
+
add.s64 %rd157, %rd212, %rd214;
|
| 327 |
+
add.s64 %rd158, %rd156, %rd157;
|
| 328 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 329 |
+
mov.b64 {_, %r34}, %rd158;
|
| 330 |
+
cvt.u32.u64 %r35, %rd158;
|
| 331 |
+
shfl.sync.bfly.b32 %r36, %r35, 16, 31, -1;
|
| 332 |
+
shfl.sync.bfly.b32 %r37, %r34, 16, 31, -1;
|
| 333 |
+
cvt.u64.u32 %rd159, %r36;
|
| 334 |
+
cvt.u64.u32 %rd160, %r37;
|
| 335 |
+
shl.b64 %rd161, %rd160, 32;
|
| 336 |
+
or.b64 %rd162, %rd159, %rd161;
|
| 337 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 338 |
+
add.s64 %rd163, %rd158, %rd162;
|
| 339 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 340 |
+
mov.b64 {_, %r38}, %rd163;
|
| 341 |
+
cvt.u32.u64 %r39, %rd163;
|
| 342 |
+
shfl.sync.bfly.b32 %r40, %r39, 8, 31, -1;
|
| 343 |
+
shfl.sync.bfly.b32 %r41, %r38, 8, 31, -1;
|
| 344 |
+
cvt.u64.u32 %rd164, %r40;
|
| 345 |
+
cvt.u64.u32 %rd165, %r41;
|
| 346 |
+
shl.b64 %rd166, %rd165, 32;
|
| 347 |
+
or.b64 %rd167, %rd164, %rd166;
|
| 348 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 349 |
+
add.s64 %rd168, %rd163, %rd167;
|
| 350 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 351 |
+
mov.b64 {_, %r42}, %rd168;
|
| 352 |
+
cvt.u32.u64 %r43, %rd168;
|
| 353 |
+
shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1;
|
| 354 |
+
shfl.sync.bfly.b32 %r45, %r42, 4, 31, -1;
|
| 355 |
+
cvt.u64.u32 %rd169, %r44;
|
| 356 |
+
cvt.u64.u32 %rd170, %r45;
|
| 357 |
+
shl.b64 %rd171, %rd170, 32;
|
| 358 |
+
or.b64 %rd172, %rd169, %rd171;
|
| 359 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 360 |
+
add.s64 %rd173, %rd168, %rd172;
|
| 361 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 362 |
+
mov.b64 {_, %r46}, %rd173;
|
| 363 |
+
cvt.u32.u64 %r47, %rd173;
|
| 364 |
+
shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1;
|
| 365 |
+
shfl.sync.bfly.b32 %r49, %r46, 2, 31, -1;
|
| 366 |
+
cvt.u64.u32 %rd174, %r48;
|
| 367 |
+
cvt.u64.u32 %rd175, %r49;
|
| 368 |
+
shl.b64 %rd176, %rd175, 32;
|
| 369 |
+
or.b64 %rd177, %rd174, %rd176;
|
| 370 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 371 |
+
add.s64 %rd178, %rd173, %rd177;
|
| 372 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 373 |
+
mov.b64 {_, %r50}, %rd178;
|
| 374 |
+
cvt.u32.u64 %r51, %rd178;
|
| 375 |
+
shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1;
|
| 376 |
+
shfl.sync.bfly.b32 %r53, %r50, 1, 31, -1;
|
| 377 |
+
cvt.u64.u32 %rd179, %r52;
|
| 378 |
+
cvt.u64.u32 %rd180, %r53;
|
| 379 |
+
shl.b64 %rd181, %rd180, 32;
|
| 380 |
+
or.b64 %rd182, %rd179, %rd181;
|
| 381 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 382 |
+
add.s64 %rd151, %rd178, %rd182;
|
| 383 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 384 |
+
setp.eq.b32 %p80, %r33, 0;
|
| 385 |
+
shr.u32 %r54, %r1, 2;
|
| 386 |
+
and.b32 %r55, %r54, 120;
|
| 387 |
+
mov.b32 %r56, global_smem;
|
| 388 |
+
add.s32 %r27, %r56, %r55;
|
| 389 |
+
// begin inline asm
|
| 390 |
+
@%p80 st.shared.b64 [ %r27 + 0 ], %rd151;
|
| 391 |
+
// end inline asm
|
| 392 |
+
bar.sync 0;
|
| 393 |
+
setp.lt.u32 %p81, %r1, 16;
|
| 394 |
+
shl.b32 %r57, %r1, 3;
|
| 395 |
+
add.s32 %r28, %r56, %r57;
|
| 396 |
+
// begin inline asm
|
| 397 |
+
@%p81 ld.shared.b64 %rd152, [ %r28 + 0 ];
|
| 398 |
+
// end inline asm
|
| 399 |
+
mov.b64 {_, %r58}, %rd152;
|
| 400 |
+
cvt.u32.u64 %r59, %rd152;
|
| 401 |
+
shfl.sync.bfly.b32 %r60, %r59, 8, 31, -1;
|
| 402 |
+
shfl.sync.bfly.b32 %r61, %r58, 8, 31, -1;
|
| 403 |
+
cvt.u64.u32 %rd183, %r60;
|
| 404 |
+
cvt.u64.u32 %rd184, %r61;
|
| 405 |
+
shl.b64 %rd185, %rd184, 32;
|
| 406 |
+
or.b64 %rd186, %rd183, %rd185;
|
| 407 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 408 |
+
add.s64 %rd187, %rd152, %rd186;
|
| 409 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 410 |
+
mov.b64 {_, %r62}, %rd187;
|
| 411 |
+
cvt.u32.u64 %r63, %rd187;
|
| 412 |
+
shfl.sync.bfly.b32 %r64, %r63, 4, 31, -1;
|
| 413 |
+
shfl.sync.bfly.b32 %r65, %r62, 4, 31, -1;
|
| 414 |
+
cvt.u64.u32 %rd188, %r64;
|
| 415 |
+
cvt.u64.u32 %rd189, %r65;
|
| 416 |
+
shl.b64 %rd190, %rd189, 32;
|
| 417 |
+
or.b64 %rd191, %rd188, %rd190;
|
| 418 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 419 |
+
add.s64 %rd192, %rd187, %rd191;
|
| 420 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 421 |
+
mov.b64 {_, %r66}, %rd192;
|
| 422 |
+
cvt.u32.u64 %r67, %rd192;
|
| 423 |
+
shfl.sync.bfly.b32 %r68, %r67, 2, 31, -1;
|
| 424 |
+
shfl.sync.bfly.b32 %r69, %r66, 2, 31, -1;
|
| 425 |
+
cvt.u64.u32 %rd193, %r68;
|
| 426 |
+
cvt.u64.u32 %rd194, %r69;
|
| 427 |
+
shl.b64 %rd195, %rd194, 32;
|
| 428 |
+
or.b64 %rd196, %rd193, %rd195;
|
| 429 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 430 |
+
add.s64 %rd197, %rd192, %rd196;
|
| 431 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 432 |
+
mov.b64 {_, %r70}, %rd197;
|
| 433 |
+
cvt.u32.u64 %r71, %rd197;
|
| 434 |
+
shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1;
|
| 435 |
+
shfl.sync.bfly.b32 %r73, %r70, 1, 31, -1;
|
| 436 |
+
cvt.u64.u32 %rd198, %r72;
|
| 437 |
+
cvt.u64.u32 %rd199, %r73;
|
| 438 |
+
shl.b64 %rd200, %rd199, 32;
|
| 439 |
+
or.b64 %rd201, %rd198, %rd200;
|
| 440 |
+
.loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 441 |
+
add.s64 %rd153, %rd197, %rd201;
|
| 442 |
+
.loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ]
|
| 443 |
+
setp.eq.b32 %p82, %r1, 0;
|
| 444 |
+
// begin inline asm
|
| 445 |
+
@%p82 st.shared.b64 [ %r28 + 0 ], %rd153;
|
| 446 |
+
// end inline asm
|
| 447 |
+
bar.sync 0;
|
| 448 |
+
ld.shared.b64 %rd202, [global_smem];
|
| 449 |
+
$L__tmp2:
|
| 450 |
+
.loc 1 92 20 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:92:20
|
| 451 |
+
add.s64 %rd203, %rd202, -1;
|
| 452 |
+
setp.lt.u64 %p86, %rd203, 16383;
|
| 453 |
+
.loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0
|
| 454 |
+
selp.b32 %r30, 1, 0, %p86;
|
| 455 |
+
.loc 1 95 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:95:21
|
| 456 |
+
setp.eq.b64 %p87, %rd202, 16384;
|
| 457 |
+
.loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0
|
| 458 |
+
selp.b32 %r31, 1, 0, %p87;
|
| 459 |
+
.loc 1 98 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:98:25
|
| 460 |
+
shl.b64 %rd204, %rd1, 2;
|
| 461 |
+
add.s64 %rd154, %rd63, %rd204;
|
| 462 |
+
.loc 1 98 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:98:37
|
| 463 |
+
setp.eq.b32 %p88, %r2, 0;
|
| 464 |
+
and.pred %p83, %p88, %p1;
|
| 465 |
+
// begin inline asm
|
| 466 |
+
@%p83 st.global.b32 [ %rd154 + 0 ], { %r30 };
|
| 467 |
+
// end inline asm
|
| 468 |
+
.loc 1 99 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:25
|
| 469 |
+
add.s64 %rd155, %rd64, %rd204;
|
| 470 |
+
.loc 1 99 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:37
|
| 471 |
+
// begin inline asm
|
| 472 |
+
@%p83 st.global.b32 [ %rd155 + 0 ], { %r31 };
|
| 473 |
+
// end inline asm
|
| 474 |
+
.loc 1 99 4 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:4
|
| 475 |
+
ret;
|
| 476 |
+
$L__tmp3:
|
| 477 |
+
$L__func_end0:
|
| 478 |
+
// -- End function
|
| 479 |
+
}
|
| 480 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py"
|
| 481 |
+
.file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
|
| 482 |
+
.section .debug_abbrev
|
| 483 |
+
{
|
| 484 |
+
.b8 1 // Abbreviation Code
|
| 485 |
+
.b8 17 // DW_TAG_compile_unit
|
| 486 |
+
.b8 1 // DW_CHILDREN_yes
|
| 487 |
+
.b8 37 // DW_AT_producer
|
| 488 |
+
.b8 8 // DW_FORM_string
|
| 489 |
+
.b8 19 // DW_AT_language
|
| 490 |
+
.b8 5 // DW_FORM_data2
|
| 491 |
+
.b8 3 // DW_AT_name
|
| 492 |
+
.b8 8 // DW_FORM_string
|
| 493 |
+
.b8 16 // DW_AT_stmt_list
|
| 494 |
+
.b8 6 // DW_FORM_data4
|
| 495 |
+
.b8 27 // DW_AT_comp_dir
|
| 496 |
+
.b8 8 // DW_FORM_string
|
| 497 |
+
.b8 0 // EOM(1)
|
| 498 |
+
.b8 0 // EOM(2)
|
| 499 |
+
.b8 2 // Abbreviation Code
|
| 500 |
+
.b8 46 // DW_TAG_subprogram
|
| 501 |
+
.b8 0 // DW_CHILDREN_no
|
| 502 |
+
.b8 3 // DW_AT_name
|
| 503 |
+
.b8 8 // DW_FORM_string
|
| 504 |
+
.b8 32 // DW_AT_inline
|
| 505 |
+
.b8 11 // DW_FORM_data1
|
| 506 |
+
.b8 0 // EOM(1)
|
| 507 |
+
.b8 0 // EOM(2)
|
| 508 |
+
.b8 3 // Abbreviation Code
|
| 509 |
+
.b8 46 // DW_TAG_subprogram
|
| 510 |
+
.b8 1 // DW_CHILDREN_yes
|
| 511 |
+
.b8 17 // DW_AT_low_pc
|
| 512 |
+
.b8 1 // DW_FORM_addr
|
| 513 |
+
.b8 18 // DW_AT_high_pc
|
| 514 |
+
.b8 1 // DW_FORM_addr
|
| 515 |
+
.b8 49 // DW_AT_abstract_origin
|
| 516 |
+
.b8 19 // DW_FORM_ref4
|
| 517 |
+
.b8 0 // EOM(1)
|
| 518 |
+
.b8 0 // EOM(2)
|
| 519 |
+
.b8 4 // Abbreviation Code
|
| 520 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 521 |
+
.b8 0 // DW_CHILDREN_no
|
| 522 |
+
.b8 49 // DW_AT_abstract_origin
|
| 523 |
+
.b8 19 // DW_FORM_ref4
|
| 524 |
+
.b8 17 // DW_AT_low_pc
|
| 525 |
+
.b8 1 // DW_FORM_addr
|
| 526 |
+
.b8 18 // DW_AT_high_pc
|
| 527 |
+
.b8 1 // DW_FORM_addr
|
| 528 |
+
.b8 88 // DW_AT_call_file
|
| 529 |
+
.b8 11 // DW_FORM_data1
|
| 530 |
+
.b8 89 // DW_AT_call_line
|
| 531 |
+
.b8 11 // DW_FORM_data1
|
| 532 |
+
.b8 87 // DW_AT_call_column
|
| 533 |
+
.b8 11 // DW_FORM_data1
|
| 534 |
+
.b8 0 // EOM(1)
|
| 535 |
+
.b8 0 // EOM(2)
|
| 536 |
+
.b8 0 // EOM(3)
|
| 537 |
+
}
|
| 538 |
+
.section .debug_info
|
| 539 |
+
{
|
| 540 |
+
.b32 307 // Length of Unit
|
| 541 |
+
.b8 2 // DWARF version number
|
| 542 |
+
.b8 0
|
| 543 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 544 |
+
.b8 8 // Address Size (in bytes)
|
| 545 |
+
.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit
|
| 546 |
+
.b8 116 // DW_AT_producer
|
| 547 |
+
.b8 114
|
| 548 |
+
.b8 105
|
| 549 |
+
.b8 116
|
| 550 |
+
.b8 111
|
| 551 |
+
.b8 110
|
| 552 |
+
.b8 0
|
| 553 |
+
.b8 2 // DW_AT_language
|
| 554 |
+
.b8 0
|
| 555 |
+
.b8 99 // DW_AT_name
|
| 556 |
+
.b8 97
|
| 557 |
+
.b8 118
|
| 558 |
+
.b8 112
|
| 559 |
+
.b8 55
|
| 560 |
+
.b8 120
|
| 561 |
+
.b8 97
|
| 562 |
+
.b8 110
|
| 563 |
+
.b8 55
|
| 564 |
+
.b8 55
|
| 565 |
+
.b8 116
|
| 566 |
+
.b8 102
|
| 567 |
+
.b8 114
|
| 568 |
+
.b8 55
|
| 569 |
+
.b8 113
|
| 570 |
+
.b8 121
|
| 571 |
+
.b8 116
|
| 572 |
+
.b8 102
|
| 573 |
+
.b8 107
|
| 574 |
+
.b8 112
|
| 575 |
+
.b8 54
|
| 576 |
+
.b8 115
|
| 577 |
+
.b8 106
|
| 578 |
+
.b8 114
|
| 579 |
+
.b8 103
|
| 580 |
+
.b8 107
|
| 581 |
+
.b8 100
|
| 582 |
+
.b8 54
|
| 583 |
+
.b8 104
|
| 584 |
+
.b8 118
|
| 585 |
+
.b8 114
|
| 586 |
+
.b8 117
|
| 587 |
+
.b8 105
|
| 588 |
+
.b8 97
|
| 589 |
+
.b8 113
|
| 590 |
+
.b8 102
|
| 591 |
+
.b8 122
|
| 592 |
+
.b8 107
|
| 593 |
+
.b8 101
|
| 594 |
+
.b8 105
|
| 595 |
+
.b8 98
|
| 596 |
+
.b8 116
|
| 597 |
+
.b8 108
|
| 598 |
+
.b8 53
|
| 599 |
+
.b8 114
|
| 600 |
+
.b8 116
|
| 601 |
+
.b8 97
|
| 602 |
+
.b8 103
|
| 603 |
+
.b8 115
|
| 604 |
+
.b8 99
|
| 605 |
+
.b8 110
|
| 606 |
+
.b8 103
|
| 607 |
+
.b8 46
|
| 608 |
+
.b8 112
|
| 609 |
+
.b8 121
|
| 610 |
+
.b8 0
|
| 611 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 612 |
+
.b8 47 // DW_AT_comp_dir
|
| 613 |
+
.b8 119
|
| 614 |
+
.b8 111
|
| 615 |
+
.b8 114
|
| 616 |
+
.b8 107
|
| 617 |
+
.b8 115
|
| 618 |
+
.b8 112
|
| 619 |
+
.b8 97
|
| 620 |
+
.b8 99
|
| 621 |
+
.b8 101
|
| 622 |
+
.b8 47
|
| 623 |
+
.b8 104
|
| 624 |
+
.b8 97
|
| 625 |
+
.b8 110
|
| 626 |
+
.b8 114
|
| 627 |
+
.b8 117
|
| 628 |
+
.b8 105
|
| 629 |
+
.b8 47
|
| 630 |
+
.b8 83
|
| 631 |
+
.b8 112
|
| 632 |
+
.b8 101
|
| 633 |
+
.b8 99
|
| 634 |
+
.b8 70
|
| 635 |
+
.b8 111
|
| 636 |
+
.b8 114
|
| 637 |
+
.b8 103
|
| 638 |
+
.b8 101
|
| 639 |
+
.b8 45
|
| 640 |
+
.b8 101
|
| 641 |
+
.b8 120
|
| 642 |
+
.b8 116
|
| 643 |
+
.b8 47
|
| 644 |
+
.b8 99
|
| 645 |
+
.b8 97
|
| 646 |
+
.b8 99
|
| 647 |
+
.b8 104
|
| 648 |
+
.b8 101
|
| 649 |
+
.b8 47
|
| 650 |
+
.b8 99
|
| 651 |
+
.b8 111
|
| 652 |
+
.b8 109
|
| 653 |
+
.b8 112
|
| 654 |
+
.b8 105
|
| 655 |
+
.b8 108
|
| 656 |
+
.b8 101
|
| 657 |
+
.b8 100
|
| 658 |
+
.b8 95
|
| 659 |
+
.b8 107
|
| 660 |
+
.b8 101
|
| 661 |
+
.b8 114
|
| 662 |
+
.b8 110
|
| 663 |
+
.b8 101
|
| 664 |
+
.b8 108
|
| 665 |
+
.b8 115
|
| 666 |
+
.b8 47
|
| 667 |
+
.b8 97
|
| 668 |
+
.b8 118
|
| 669 |
+
.b8 0
|
| 670 |
+
.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram
|
| 671 |
+
.b8 116 // DW_AT_name
|
| 672 |
+
.b8 114
|
| 673 |
+
.b8 105
|
| 674 |
+
.b8 116
|
| 675 |
+
.b8 111
|
| 676 |
+
.b8 110
|
| 677 |
+
.b8 95
|
| 678 |
+
.b8 114
|
| 679 |
+
.b8 101
|
| 680 |
+
.b8 100
|
| 681 |
+
.b8 95
|
| 682 |
+
.b8 102
|
| 683 |
+
.b8 117
|
| 684 |
+
.b8 115
|
| 685 |
+
.b8 101
|
| 686 |
+
.b8 100
|
| 687 |
+
.b8 95
|
| 688 |
+
.b8 95
|
| 689 |
+
.b8 116
|
| 690 |
+
.b8 111
|
| 691 |
+
.b8 95
|
| 692 |
+
.b8 99
|
| 693 |
+
.b8 111
|
| 694 |
+
.b8 112
|
| 695 |
+
.b8 121
|
| 696 |
+
.b8 95
|
| 697 |
+
.b8 97
|
| 698 |
+
.b8 114
|
| 699 |
+
.b8 97
|
| 700 |
+
.b8 110
|
| 701 |
+
.b8 103
|
| 702 |
+
.b8 101
|
| 703 |
+
.b8 95
|
| 704 |
+
.b8 98
|
| 705 |
+
.b8 105
|
| 706 |
+
.b8 116
|
| 707 |
+
.b8 119
|
| 708 |
+
.b8 105
|
| 709 |
+
.b8 115
|
| 710 |
+
.b8 101
|
| 711 |
+
.b8 95
|
| 712 |
+
.b8 97
|
| 713 |
+
.b8 110
|
| 714 |
+
.b8 100
|
| 715 |
+
.b8 95
|
| 716 |
+
.b8 98
|
| 717 |
+
.b8 105
|
| 718 |
+
.b8 116
|
| 719 |
+
.b8 119
|
| 720 |
+
.b8 105
|
| 721 |
+
.b8 115
|
| 722 |
+
.b8 101
|
| 723 |
+
.b8 95
|
| 724 |
+
.b8 111
|
| 725 |
+
.b8 114
|
| 726 |
+
.b8 95
|
| 727 |
+
.b8 99
|
| 728 |
+
.b8 111
|
| 729 |
+
.b8 110
|
| 730 |
+
.b8 115
|
| 731 |
+
.b8 116
|
| 732 |
+
.b8 97
|
| 733 |
+
.b8 110
|
| 734 |
+
.b8 116
|
| 735 |
+
.b8 95
|
| 736 |
+
.b8 112
|
| 737 |
+
.b8 97
|
| 738 |
+
.b8 100
|
| 739 |
+
.b8 95
|
| 740 |
+
.b8 110
|
| 741 |
+
.b8 100
|
| 742 |
+
.b8 95
|
| 743 |
+
.b8 101
|
| 744 |
+
.b8 113
|
| 745 |
+
.b8 95
|
| 746 |
+
.b8 103
|
| 747 |
+
.b8 101
|
| 748 |
+
.b8 95
|
| 749 |
+
.b8 103
|
| 750 |
+
.b8 116
|
| 751 |
+
.b8 95
|
| 752 |
+
.b8 105
|
| 753 |
+
.b8 110
|
| 754 |
+
.b8 100
|
| 755 |
+
.b8 101
|
| 756 |
+
.b8 120
|
| 757 |
+
.b8 95
|
| 758 |
+
.b8 108
|
| 759 |
+
.b8 116
|
| 760 |
+
.b8 95
|
| 761 |
+
.b8 112
|
| 762 |
+
.b8 101
|
| 763 |
+
.b8 114
|
| 764 |
+
.b8 109
|
| 765 |
+
.b8 117
|
| 766 |
+
.b8 116
|
| 767 |
+
.b8 101
|
| 768 |
+
.b8 95
|
| 769 |
+
.b8 114
|
| 770 |
+
.b8 101
|
| 771 |
+
.b8 109
|
| 772 |
+
.b8 97
|
| 773 |
+
.b8 105
|
| 774 |
+
.b8 110
|
| 775 |
+
.b8 100
|
| 776 |
+
.b8 101
|
| 777 |
+
.b8 114
|
| 778 |
+
.b8 95
|
| 779 |
+
.b8 115
|
| 780 |
+
.b8 117
|
| 781 |
+
.b8 98
|
| 782 |
+
.b8 95
|
| 783 |
+
.b8 115
|
| 784 |
+
.b8 117
|
| 785 |
+
.b8 109
|
| 786 |
+
.b8 95
|
| 787 |
+
.b8 118
|
| 788 |
+
.b8 105
|
| 789 |
+
.b8 101
|
| 790 |
+
.b8 119
|
| 791 |
+
.b8 95
|
| 792 |
+
.b8 49
|
| 793 |
+
.b8 0
|
| 794 |
+
.b8 1 // DW_AT_inline
|
| 795 |
+
.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram
|
| 796 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 797 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 798 |
+
.b32 139 // DW_AT_abstract_origin
|
| 799 |
+
.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine
|
| 800 |
+
.b32 139 // DW_AT_abstract_origin
|
| 801 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 802 |
+
.b64 $L__tmp2 // DW_AT_high_pc
|
| 803 |
+
.b8 1 // DW_AT_call_file
|
| 804 |
+
.b8 87 // DW_AT_call_line
|
| 805 |
+
.b8 27 // DW_AT_call_column
|
| 806 |
+
.b8 0 // End Of Children Mark
|
| 807 |
+
.b8 0 // End Of Children Mark
|
| 808 |
+
}
|
| 809 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0)
|
| 2 |
+
#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
|
| 3 |
+
#loc99 = loc(unknown)
|
| 4 |
+
#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
|
| 5 |
+
#loc106 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc107 = loc("out_ptr1"(#loc))
|
| 7 |
+
#loc108 = loc("out_ptr2"(#loc))
|
| 8 |
+
#loc109 = loc("ks0"(#loc))
|
| 9 |
+
#loc110 = loc("ks1"(#loc))
|
| 10 |
+
#loc111 = loc("ks2"(#loc))
|
| 11 |
+
#loc112 = loc("ks3"(#loc))
|
| 12 |
+
#loc113 = loc("ks4"(#loc))
|
| 13 |
+
#loc114 = loc("ks5"(#loc))
|
| 14 |
+
#loc115 = loc("xnumel"(#loc))
|
| 15 |
+
#loc116 = loc("r0_numel"(#loc))
|
| 16 |
+
#loc207 = loc("input"(#loc97))
|
| 17 |
+
#loc208 = loc("a"(#loc102))
|
| 18 |
+
#loc209 = loc("b"(#loc102))
|
| 19 |
+
module {
|
| 20 |
+
tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 21 |
+
%r0_numel_0 = arith.constant 16384 : i32 loc(#loc117)
|
| 22 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc118)
|
| 23 |
+
%xoffset_1 = arith.constant 1 : i32 loc(#loc119)
|
| 24 |
+
%xoffset_2 = arith.constant 1 : i32 loc(#loc119)
|
| 25 |
+
%xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc119)
|
| 26 |
+
%xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120)
|
| 27 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121)
|
| 28 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc122)
|
| 29 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc122)
|
| 30 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc123)
|
| 31 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc123)
|
| 32 |
+
%r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc124)
|
| 33 |
+
%r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc125)
|
| 34 |
+
%x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc126)
|
| 35 |
+
%x1_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc126)
|
| 36 |
+
%x1_10 = arith.divsi %x1, %x1_9 : tensor<1x1xi64> loc(#loc126)
|
| 37 |
+
%x1_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc127)
|
| 38 |
+
%x1_12 = arith.remsi %x1_10, %x1_11 : tensor<1x1xi64> loc(#loc127)
|
| 39 |
+
%x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc128)
|
| 40 |
+
%x0_13 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc128)
|
| 41 |
+
%x0_14 = arith.remsi %x0, %x0_13 : tensor<1x1xi64> loc(#loc128)
|
| 42 |
+
%x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc129)
|
| 43 |
+
%x2_15 = tt.splat %ks4 : i64 -> tensor<1x1xi64> loc(#loc129)
|
| 44 |
+
%x2_16 = arith.divsi %x2, %x2_15 : tensor<1x1xi64> loc(#loc129)
|
| 45 |
+
%_tmp46 = arith.constant 0 : i64 loc(#loc130)
|
| 46 |
+
%_tmp46_17 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc130)
|
| 47 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc15)
|
| 48 |
+
%c2048_i32 = arith.constant 2048 : i32 loc(#loc15)
|
| 49 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
|
| 50 |
+
%1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15)
|
| 51 |
+
%2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc15)
|
| 52 |
+
%3 = ub.poison : i32 loc(#loc15)
|
| 53 |
+
%_tmp46_18 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp46_22 = %_tmp46_17) -> (tensor<1x2048xi64>) : i32 {
|
| 54 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc132)
|
| 55 |
+
%r0_index_23 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc132)
|
| 56 |
+
%r0_mask = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc133)
|
| 57 |
+
%r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x2048xi32> loc(#loc133)
|
| 58 |
+
%r0_4 = arith.constant 128 : i32 loc(#loc134)
|
| 59 |
+
%r0_4_25 = arith.constant 128 : i32 loc(#loc134)
|
| 60 |
+
%r0_4_26 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc134)
|
| 61 |
+
%r0_4_27 = arith.divsi %r0_index_23, %r0_4_26 : tensor<1x2048xi32> loc(#loc134)
|
| 62 |
+
%r0_3 = arith.constant 128 : i32 loc(#loc135)
|
| 63 |
+
%r0_3_28 = arith.constant 128 : i32 loc(#loc135)
|
| 64 |
+
%r0_3_29 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc135)
|
| 65 |
+
%r0_3_30 = arith.remsi %r0_index_23, %r0_3_29 : tensor<1x2048xi32> loc(#loc135)
|
| 66 |
+
%tmp0 = arith.constant 128 : i32 loc(#loc136)
|
| 67 |
+
%tmp0_31 = arith.constant 128 : i64 loc(#loc136)
|
| 68 |
+
%tmp0_32 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc136)
|
| 69 |
+
%tmp0_33 = arith.muli %tmp0_32, %x1_12 : tensor<1x1xi64> loc(#loc136)
|
| 70 |
+
%tmp0_34 = arith.extsi %r0_4_27 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc137)
|
| 71 |
+
%tmp0_35 = tt.broadcast %tmp0_33 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc137)
|
| 72 |
+
%tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<1x2048xi64> loc(#loc137)
|
| 73 |
+
%tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64> loc(#loc138)
|
| 74 |
+
%tmp2_37 = arith.cmpi slt, %tmp0_36, %tmp2 : tensor<1x2048xi64> loc(#loc138)
|
| 75 |
+
%tmp3 = arith.constant 128 : i32 loc(#loc139)
|
| 76 |
+
%tmp3_38 = arith.constant 128 : i64 loc(#loc139)
|
| 77 |
+
%tmp3_39 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc139)
|
| 78 |
+
%tmp3_40 = arith.muli %tmp3_39, %x0_14 : tensor<1x1xi64> loc(#loc139)
|
| 79 |
+
%tmp3_41 = arith.extsi %r0_3_30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc140)
|
| 80 |
+
%tmp3_42 = tt.broadcast %tmp3_40 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc140)
|
| 81 |
+
%tmp3_43 = arith.addi %tmp3_41, %tmp3_42 : tensor<1x2048xi64> loc(#loc140)
|
| 82 |
+
%tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64> loc(#loc141)
|
| 83 |
+
%tmp5_44 = arith.cmpi slt, %tmp3_43, %tmp5 : tensor<1x2048xi64> loc(#loc141)
|
| 84 |
+
%tmp6 = arith.andi %tmp2_37, %tmp5_44 : tensor<1x2048xi1> loc(#loc142)
|
| 85 |
+
%tmp7 = arith.constant 128 : i32 loc(#loc143)
|
| 86 |
+
%tmp7_45 = arith.constant 128 : i64 loc(#loc143)
|
| 87 |
+
%tmp7_46 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc143)
|
| 88 |
+
%tmp7_47 = arith.muli %tmp7_46, %x1_12 : tensor<1x1xi64> loc(#loc143)
|
| 89 |
+
%tmp7_48 = arith.extsi %r0_4_27 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc144)
|
| 90 |
+
%tmp7_49 = tt.broadcast %tmp7_47 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc144)
|
| 91 |
+
%tmp7_50 = arith.addi %tmp7_48, %tmp7_49 : tensor<1x2048xi64> loc(#loc144)
|
| 92 |
+
%tmp8 = arith.constant 128 : i32 loc(#loc145)
|
| 93 |
+
%tmp8_51 = arith.constant 128 : i64 loc(#loc145)
|
| 94 |
+
%tmp8_52 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc145)
|
| 95 |
+
%tmp8_53 = arith.muli %tmp8_52, %x0_14 : tensor<1x1xi64> loc(#loc145)
|
| 96 |
+
%tmp8_54 = arith.extsi %r0_3_30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc146)
|
| 97 |
+
%tmp8_55 = tt.broadcast %tmp8_53 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc146)
|
| 98 |
+
%tmp8_56 = arith.addi %tmp8_54, %tmp8_55 : tensor<1x2048xi64> loc(#loc146)
|
| 99 |
+
%tmp9 = arith.cmpi sge, %tmp7_50, %tmp8_56 : tensor<1x2048xi64> loc(#loc147)
|
| 100 |
+
%tmp10 = tt.broadcast %x2_16 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc148)
|
| 101 |
+
%tmp10_57 = tt.splat %in_ptr0 : !tt.ptr<i64> -> tensor<1x2048x!tt.ptr<i64>> loc(#loc149)
|
| 102 |
+
%tmp10_58 = tt.addptr %tmp10_57, %tmp10 : tensor<1x2048x!tt.ptr<i64>>, tensor<1x2048xi64> loc(#loc149)
|
| 103 |
+
%tmp10_59 = arith.andi %r0_mask_24, %tmp6 : tensor<1x2048xi1> loc(#loc150)
|
| 104 |
+
%tmp10_60 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc151)
|
| 105 |
+
%tmp10_61 = arith.andi %tmp10_59, %tmp10_60 : tensor<1x2048xi1> loc(#loc151)
|
| 106 |
+
%tmp10_62 = arith.constant 0.000000e+00 : f32 loc(#loc152)
|
| 107 |
+
%tmp10_63 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc152)
|
| 108 |
+
%tmp10_64 = arith.fptosi %tmp10_63 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc152)
|
| 109 |
+
%tmp10_65 = tt.load %tmp10_58, %tmp10_61, %tmp10_64 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<i64>> loc(#loc152)
|
| 110 |
+
%tmp11 = arith.cmpi slt, %tmp8_56, %tmp10_65 : tensor<1x2048xi64> loc(#loc153)
|
| 111 |
+
%tmp12 = arith.cmpi slt, %tmp7_50, %tmp10_65 : tensor<1x2048xi64> loc(#loc154)
|
| 112 |
+
%tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1> loc(#loc155)
|
| 113 |
+
%tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1> loc(#loc156)
|
| 114 |
+
%tmp15 = arith.constant false loc(#loc157)
|
| 115 |
+
%tmp15_66 = arith.constant dense<false> : tensor<1x1xi1> loc(#loc157)
|
| 116 |
+
%tmp16 = arith.constant dense<false> : tensor<1x2048xi1> loc(#loc158)
|
| 117 |
+
%tmp16_67 = arith.ori %tmp16, %tmp14 : tensor<1x2048xi1> loc(#loc158)
|
| 118 |
+
%tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64> loc(#loc159)
|
| 119 |
+
%tmp18 = arith.cmpi sge, %tmp8_56, %tmp17 : tensor<1x2048xi64> loc(#loc160)
|
| 120 |
+
%tmp19 = arith.remsi %tmp8_56, %tmp17 : tensor<1x2048xi64> loc(#loc161)
|
| 121 |
+
%tmp20 = arith.constant 0 : i32 loc(#loc162)
|
| 122 |
+
%tmp20_68 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc162)
|
| 123 |
+
%tmp21 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc163)
|
| 124 |
+
%tmp21_69 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc163)
|
| 125 |
+
%tmp21_70 = arith.cmpi ne, %tmp19, %tmp21_69 : tensor<1x2048xi64> loc(#loc163)
|
| 126 |
+
%tmp22 = arith.constant 0 : i32 loc(#loc164)
|
| 127 |
+
%tmp22_71 = arith.extsi %tmp22 : i32 to i64 loc(#loc164)
|
| 128 |
+
%tmp22_72 = tt.splat %tmp22_71 : i64 -> tensor<1x2048xi64> loc(#loc164)
|
| 129 |
+
%tmp22_73 = arith.cmpi slt, %tmp19, %tmp22_72 : tensor<1x2048xi64> loc(#loc164)
|
| 130 |
+
%tmp23 = arith.constant 0 : i32 loc(#loc165)
|
| 131 |
+
%tmp23_74 = arith.extsi %tmp23 : i32 to i64 loc(#loc165)
|
| 132 |
+
%tmp23_75 = tt.splat %tmp23_74 : i64 -> tensor<1x2048xi64> loc(#loc165)
|
| 133 |
+
%tmp23_76 = arith.cmpi slt, %tmp17, %tmp23_75 : tensor<1x2048xi64> loc(#loc165)
|
| 134 |
+
%tmp24 = arith.cmpi ne, %tmp22_73, %tmp23_76 : tensor<1x2048xi1> loc(#loc166)
|
| 135 |
+
%tmp25 = arith.andi %tmp21_70, %tmp24 : tensor<1x2048xi1> loc(#loc167)
|
| 136 |
+
%tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64> loc(#loc168)
|
| 137 |
+
%tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc169)
|
| 138 |
+
%tmp28 = arith.cmpi slt, %tmp27, %tmp10_65 : tensor<1x2048xi64> loc(#loc170)
|
| 139 |
+
%tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1> loc(#loc171)
|
| 140 |
+
%tmp30 = arith.constant -1 : i32 loc(#loc172)
|
| 141 |
+
%tmp30_77 = arith.constant -1 : i32 loc(#loc172)
|
| 142 |
+
%tmp30_78 = arith.constant dense<-1> : tensor<1x2048xi32> loc(#loc172)
|
| 143 |
+
%tmp30_79 = arith.muli %tmp30_78, %r0_4_27 : tensor<1x2048xi32> loc(#loc172)
|
| 144 |
+
%tmp30_80 = arith.addi %r0_3_30, %tmp30_79 : tensor<1x2048xi32> loc(#loc173)
|
| 145 |
+
%tmp30_81 = arith.constant -128 : i32 loc(#loc174)
|
| 146 |
+
%tmp30_82 = arith.constant -128 : i64 loc(#loc174)
|
| 147 |
+
%tmp30_83 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc174)
|
| 148 |
+
%tmp30_84 = arith.muli %tmp30_83, %x1_12 : tensor<1x1xi64> loc(#loc174)
|
| 149 |
+
%tmp30_85 = arith.extsi %tmp30_80 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc175)
|
| 150 |
+
%tmp30_86 = tt.broadcast %tmp30_84 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc175)
|
| 151 |
+
%tmp30_87 = arith.addi %tmp30_85, %tmp30_86 : tensor<1x2048xi64> loc(#loc175)
|
| 152 |
+
%tmp30_88 = arith.constant 128 : i32 loc(#loc176)
|
| 153 |
+
%tmp30_89 = arith.constant 128 : i64 loc(#loc176)
|
| 154 |
+
%tmp30_90 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc176)
|
| 155 |
+
%tmp30_91 = arith.muli %tmp30_90, %x0_14 : tensor<1x1xi64> loc(#loc176)
|
| 156 |
+
%tmp30_92 = tt.broadcast %tmp30_91 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc177)
|
| 157 |
+
%tmp30_93 = arith.addi %tmp30_87, %tmp30_92 : tensor<1x2048xi64> loc(#loc177)
|
| 158 |
+
%tmp31 = arith.remsi %tmp30_93, %tmp17 : tensor<1x2048xi64> loc(#loc178)
|
| 159 |
+
%tmp32 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc179)
|
| 160 |
+
%tmp32_94 = tt.broadcast %tmp32 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc179)
|
| 161 |
+
%tmp32_95 = arith.cmpi ne, %tmp31, %tmp32_94 : tensor<1x2048xi64> loc(#loc179)
|
| 162 |
+
%tmp33 = arith.constant 0 : i32 loc(#loc180)
|
| 163 |
+
%tmp33_96 = arith.extsi %tmp33 : i32 to i64 loc(#loc180)
|
| 164 |
+
%tmp33_97 = tt.splat %tmp33_96 : i64 -> tensor<1x2048xi64> loc(#loc180)
|
| 165 |
+
%tmp33_98 = arith.cmpi slt, %tmp31, %tmp33_97 : tensor<1x2048xi64> loc(#loc180)
|
| 166 |
+
%tmp34 = arith.cmpi ne, %tmp33_98, %tmp23_76 : tensor<1x2048xi1> loc(#loc181)
|
| 167 |
+
%tmp35 = arith.andi %tmp32_95, %tmp34 : tensor<1x2048xi1> loc(#loc182)
|
| 168 |
+
%tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64> loc(#loc183)
|
| 169 |
+
%tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc184)
|
| 170 |
+
%tmp38 = arith.constant 0 : i64 loc(#loc185)
|
| 171 |
+
%tmp38_99 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc185)
|
| 172 |
+
%tmp39 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc186)
|
| 173 |
+
%tmp39_100 = arith.cmpi eq, %tmp37, %tmp39 : tensor<1x2048xi64> loc(#loc186)
|
| 174 |
+
%tmp40 = arith.andi %tmp29, %tmp39_100 : tensor<1x2048xi1> loc(#loc187)
|
| 175 |
+
%tmp41 = arith.ori %tmp16_67, %tmp40 : tensor<1x2048xi1> loc(#loc188)
|
| 176 |
+
%tmp42 = arith.constant false loc(#loc189)
|
| 177 |
+
%tmp42_101 = arith.constant dense<false> : tensor<1x2048xi1> loc(#loc189)
|
| 178 |
+
%tmp43 = arith.select %tmp6, %tmp41, %tmp42_101 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc190)
|
| 179 |
+
%tmp44 = arith.extui %tmp43 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc191)
|
| 180 |
+
%tmp47 = arith.addi %_tmp46_22, %tmp44 : tensor<1x2048xi64> loc(#loc192)
|
| 181 |
+
%_tmp46_102 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc193)
|
| 182 |
+
%_tmp46_103 = arith.andi %r0_mask_24, %_tmp46_102 : tensor<1x2048xi1> loc(#loc193)
|
| 183 |
+
%_tmp46_104 = arith.select %_tmp46_103, %tmp47, %_tmp46_22 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc194)
|
| 184 |
+
scf.yield %_tmp46_104 : tensor<1x2048xi64> loc(#loc79)
|
| 185 |
+
} loc(#loc131)
|
| 186 |
+
%tmp46 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp46_18) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc195)
|
| 187 |
+
%tmp46_19 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc196)
|
| 188 |
+
%tmp48 = arith.constant 0 : i64 loc(#loc197)
|
| 189 |
+
%tmp48_20 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc197)
|
| 190 |
+
%tmp49 = arith.cmpi sgt, %tmp46_19, %tmp48_20 : tensor<1x1xi64> loc(#loc198)
|
| 191 |
+
%tmp50 = arith.constant 16384 : i64 loc(#loc199)
|
| 192 |
+
%tmp50_21 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc199)
|
| 193 |
+
%tmp51 = arith.cmpi slt, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc200)
|
| 194 |
+
%tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc201)
|
| 195 |
+
%tmp53 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc202)
|
| 196 |
+
%tmp54 = arith.extsi %tmp53 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc203)
|
| 197 |
+
%tmp55 = arith.cmpi eq, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc204)
|
| 198 |
+
%tmp56 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc205)
|
| 199 |
+
%tmp57 = arith.extsi %tmp56 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc206)
|
| 200 |
+
%4 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc92)
|
| 201 |
+
%5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr<i32>>, tensor<1x1xi32> loc(#loc92)
|
| 202 |
+
tt.store %5, %tmp54, %xmask_7 : tensor<1x1x!tt.ptr<i32>> loc(#loc93)
|
| 203 |
+
%6 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc94)
|
| 204 |
+
%7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr<i32>>, tensor<1x1xi32> loc(#loc94)
|
| 205 |
+
tt.store %7, %tmp57, %xmask_7 : tensor<1x1x!tt.ptr<i32>> loc(#loc95)
|
| 206 |
+
tt.return loc(#loc96)
|
| 207 |
+
} loc(#loc)
|
| 208 |
+
tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc97))) -> tensor<1xi64> attributes {noinline = false} {
|
| 209 |
+
%0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 210 |
+
^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)):
|
| 211 |
+
%2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc98)
|
| 212 |
+
tt.reduce.return %2 : i64 loc(#loc98)
|
| 213 |
+
}) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc98)
|
| 214 |
+
tt.return %0 : tensor<1xi64> loc(#loc100)
|
| 215 |
+
^bb1: // no predecessors
|
| 216 |
+
%1 = ub.poison : tensor<1xi64> loc(#loc101)
|
| 217 |
+
tt.return %1 : tensor<1xi64> loc(#loc101)
|
| 218 |
+
} loc(#loc97)
|
| 219 |
+
tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc102)), %b: i64 loc("b"(#loc102))) -> i64 attributes {noinline = false} {
|
| 220 |
+
%0 = arith.addi %a, %b : i64 loc(#loc103)
|
| 221 |
+
tt.return %0 : i64 loc(#loc104)
|
| 222 |
+
^bb1: // no predecessors
|
| 223 |
+
%1 = ub.poison : i64 loc(#loc105)
|
| 224 |
+
tt.return %1 : i64 loc(#loc105)
|
| 225 |
+
} loc(#loc102)
|
| 226 |
+
} loc(#loc)
|
| 227 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":19:15)
|
| 228 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28)
|
| 229 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:33)
|
| 230 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:36)
|
| 231 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:44)
|
| 232 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:23)
|
| 233 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21)
|
| 234 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:27)
|
| 235 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37)
|
| 236 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21)
|
| 237 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28)
|
| 238 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19)
|
| 239 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19)
|
| 240 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":30:44)
|
| 241 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40)
|
| 242 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31)
|
| 243 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29)
|
| 244 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27)
|
| 245 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27)
|
| 246 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26)
|
| 247 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22)
|
| 248 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22)
|
| 249 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26)
|
| 250 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22)
|
| 251 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22)
|
| 252 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22)
|
| 253 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":46:26)
|
| 254 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":46:22)
|
| 255 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":47:26)
|
| 256 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":47:22)
|
| 257 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23)
|
| 258 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:55)
|
| 259 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35)
|
| 260 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87)
|
| 261 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94)
|
| 262 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77)
|
| 263 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23)
|
| 264 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23)
|
| 265 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24)
|
| 266 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23)
|
| 267 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":54:39)
|
| 268 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":55:24)
|
| 269 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37)
|
| 270 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24)
|
| 271 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24)
|
| 272 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":59:35)
|
| 273 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25)
|
| 274 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92)
|
| 275 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92)
|
| 276 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25)
|
| 277 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24)
|
| 278 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24)
|
| 279 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39)
|
| 280 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24)
|
| 281 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24)
|
| 282 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:29)
|
| 283 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24)
|
| 284 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45)
|
| 285 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38)
|
| 286 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:55)
|
| 287 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51)
|
| 288 |
+
#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25)
|
| 289 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25)
|
| 290 |
+
#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92)
|
| 291 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25)
|
| 292 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24)
|
| 293 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24)
|
| 294 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39)
|
| 295 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":77:35)
|
| 296 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25)
|
| 297 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24)
|
| 298 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24)
|
| 299 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":81:44)
|
| 300 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38)
|
| 301 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25)
|
| 302 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25)
|
| 303 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36)
|
| 304 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50)
|
| 305 |
+
#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8)
|
| 306 |
+
#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27)
|
| 307 |
+
#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30)
|
| 308 |
+
#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":88:31)
|
| 309 |
+
#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20)
|
| 310 |
+
#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":90:35)
|
| 311 |
+
#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20)
|
| 312 |
+
#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20)
|
| 313 |
+
#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21)
|
| 314 |
+
#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21)
|
| 315 |
+
#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21)
|
| 316 |
+
#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21)
|
| 317 |
+
#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21)
|
| 318 |
+
#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25)
|
| 319 |
+
#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37)
|
| 320 |
+
#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25)
|
| 321 |
+
#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37)
|
| 322 |
+
#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4)
|
| 323 |
+
#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 324 |
+
#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
|
| 325 |
+
#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
|
| 326 |
+
#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 327 |
+
#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
|
| 328 |
+
#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
|
| 329 |
+
#loc117 = loc("r0_numel"(#loc1))
|
| 330 |
+
#loc118 = loc("xoffset"(#loc2))
|
| 331 |
+
#loc119 = loc("xoffset"(#loc3))
|
| 332 |
+
#loc120 = loc("xindex"(#loc4))
|
| 333 |
+
#loc121 = loc("xindex"(#loc5))
|
| 334 |
+
#loc122 = loc("xindex"(#loc6))
|
| 335 |
+
#loc123 = loc("xmask"(#loc7))
|
| 336 |
+
#loc124 = loc("r0_base"(#loc8))
|
| 337 |
+
#loc125 = loc("r0_base"(#loc9))
|
| 338 |
+
#loc126 = loc("x1"(#loc10))
|
| 339 |
+
#loc127 = loc("x1"(#loc11))
|
| 340 |
+
#loc128 = loc("x0"(#loc12))
|
| 341 |
+
#loc129 = loc("x2"(#loc13))
|
| 342 |
+
#loc130 = loc("_tmp46"(#loc14))
|
| 343 |
+
#loc131 = loc("_tmp46"(#loc15))
|
| 344 |
+
#loc132 = loc("r0_index"(#loc16))
|
| 345 |
+
#loc133 = loc("r0_mask"(#loc17))
|
| 346 |
+
#loc134 = loc("r0_4"(#loc18))
|
| 347 |
+
#loc135 = loc("r0_3"(#loc19))
|
| 348 |
+
#loc136 = loc("tmp0"(#loc20))
|
| 349 |
+
#loc137 = loc("tmp0"(#loc21))
|
| 350 |
+
#loc138 = loc("tmp2"(#loc22))
|
| 351 |
+
#loc139 = loc("tmp3"(#loc23))
|
| 352 |
+
#loc140 = loc("tmp3"(#loc24))
|
| 353 |
+
#loc141 = loc("tmp5"(#loc25))
|
| 354 |
+
#loc142 = loc("tmp6"(#loc26))
|
| 355 |
+
#loc143 = loc("tmp7"(#loc27))
|
| 356 |
+
#loc144 = loc("tmp7"(#loc28))
|
| 357 |
+
#loc145 = loc("tmp8"(#loc29))
|
| 358 |
+
#loc146 = loc("tmp8"(#loc30))
|
| 359 |
+
#loc147 = loc("tmp9"(#loc31))
|
| 360 |
+
#loc148 = loc("tmp10"(#loc32))
|
| 361 |
+
#loc149 = loc("tmp10"(#loc33))
|
| 362 |
+
#loc150 = loc("tmp10"(#loc34))
|
| 363 |
+
#loc151 = loc("tmp10"(#loc35))
|
| 364 |
+
#loc152 = loc("tmp10"(#loc36))
|
| 365 |
+
#loc153 = loc("tmp11"(#loc37))
|
| 366 |
+
#loc154 = loc("tmp12"(#loc38))
|
| 367 |
+
#loc155 = loc("tmp13"(#loc39))
|
| 368 |
+
#loc156 = loc("tmp14"(#loc40))
|
| 369 |
+
#loc157 = loc("tmp15"(#loc41))
|
| 370 |
+
#loc158 = loc("tmp16"(#loc42))
|
| 371 |
+
#loc159 = loc("tmp17"(#loc43))
|
| 372 |
+
#loc160 = loc("tmp18"(#loc44))
|
| 373 |
+
#loc161 = loc("tmp19"(#loc45))
|
| 374 |
+
#loc162 = loc("tmp20"(#loc46))
|
| 375 |
+
#loc163 = loc("tmp21"(#loc47))
|
| 376 |
+
#loc164 = loc("tmp22"(#loc48))
|
| 377 |
+
#loc165 = loc("tmp23"(#loc49))
|
| 378 |
+
#loc166 = loc("tmp24"(#loc50))
|
| 379 |
+
#loc167 = loc("tmp25"(#loc51))
|
| 380 |
+
#loc168 = loc("tmp26"(#loc52))
|
| 381 |
+
#loc169 = loc("tmp27"(#loc53))
|
| 382 |
+
#loc170 = loc("tmp28"(#loc54))
|
| 383 |
+
#loc171 = loc("tmp29"(#loc55))
|
| 384 |
+
#loc172 = loc("tmp30"(#loc56))
|
| 385 |
+
#loc173 = loc("tmp30"(#loc57))
|
| 386 |
+
#loc174 = loc("tmp30"(#loc58))
|
| 387 |
+
#loc175 = loc("tmp30"(#loc59))
|
| 388 |
+
#loc176 = loc("tmp30"(#loc60))
|
| 389 |
+
#loc177 = loc("tmp30"(#loc61))
|
| 390 |
+
#loc178 = loc("tmp31"(#loc62))
|
| 391 |
+
#loc179 = loc("tmp32"(#loc63))
|
| 392 |
+
#loc180 = loc("tmp33"(#loc64))
|
| 393 |
+
#loc181 = loc("tmp34"(#loc65))
|
| 394 |
+
#loc182 = loc("tmp35"(#loc66))
|
| 395 |
+
#loc183 = loc("tmp36"(#loc67))
|
| 396 |
+
#loc184 = loc("tmp37"(#loc68))
|
| 397 |
+
#loc185 = loc("tmp38"(#loc69))
|
| 398 |
+
#loc186 = loc("tmp39"(#loc70))
|
| 399 |
+
#loc187 = loc("tmp40"(#loc71))
|
| 400 |
+
#loc188 = loc("tmp41"(#loc72))
|
| 401 |
+
#loc189 = loc("tmp42"(#loc73))
|
| 402 |
+
#loc190 = loc("tmp43"(#loc74))
|
| 403 |
+
#loc191 = loc("tmp44"(#loc75))
|
| 404 |
+
#loc192 = loc("tmp47"(#loc76))
|
| 405 |
+
#loc193 = loc("_tmp46"(#loc77))
|
| 406 |
+
#loc194 = loc("_tmp46"(#loc78))
|
| 407 |
+
#loc195 = loc("tmp46"(#loc80))
|
| 408 |
+
#loc196 = loc("tmp46"(#loc81))
|
| 409 |
+
#loc197 = loc("tmp48"(#loc82))
|
| 410 |
+
#loc198 = loc("tmp49"(#loc83))
|
| 411 |
+
#loc199 = loc("tmp50"(#loc84))
|
| 412 |
+
#loc200 = loc("tmp51"(#loc85))
|
| 413 |
+
#loc201 = loc("tmp52"(#loc86))
|
| 414 |
+
#loc202 = loc("tmp53"(#loc87))
|
| 415 |
+
#loc203 = loc("tmp54"(#loc88))
|
| 416 |
+
#loc204 = loc("tmp55"(#loc89))
|
| 417 |
+
#loc205 = loc("tmp56"(#loc90))
|
| 418 |
+
#loc206 = loc("tmp57"(#loc91))
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}>
|
| 2 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0)
|
| 3 |
+
#loc1 = loc(unknown)
|
| 4 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27)
|
| 5 |
+
#loc79 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc80 = loc("out_ptr1"(#loc))
|
| 7 |
+
#loc81 = loc("out_ptr2"(#loc))
|
| 8 |
+
#loc82 = loc("ks0"(#loc))
|
| 9 |
+
#loc83 = loc("ks1"(#loc))
|
| 10 |
+
#loc84 = loc("ks2"(#loc))
|
| 11 |
+
#loc85 = loc("ks3"(#loc))
|
| 12 |
+
#loc86 = loc("ks4"(#loc))
|
| 13 |
+
#loc87 = loc("ks5"(#loc))
|
| 14 |
+
#loc88 = loc("xnumel"(#loc))
|
| 15 |
+
#loc89 = loc("r0_numel"(#loc))
|
| 16 |
+
#loc149 = loc("tmp46"(#loc63))
|
| 17 |
+
#loc164 = loc(callsite(#loc1 at #loc149))
|
| 18 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 19 |
+
tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 20 |
+
%cst = arith.constant dense<128> : tensor<1x2048xi32, #blocked> loc(#loc1)
|
| 21 |
+
%cst_0 = arith.constant dense<16384> : tensor<1x2048xi32, #blocked> loc(#loc1)
|
| 22 |
+
%c-128_i64 = arith.constant -128 : i64 loc(#loc1)
|
| 23 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 24 |
+
%c128_i64 = arith.constant 128 : i64 loc(#loc1)
|
| 25 |
+
%c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
|
| 26 |
+
%c16384_i32 = arith.constant 16384 : i32 loc(#loc1)
|
| 27 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 28 |
+
%cst_1 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1)
|
| 29 |
+
%cst_2 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1)
|
| 30 |
+
%cst_3 = arith.constant dense<false> : tensor<1x2048xi1, #blocked> loc(#loc1)
|
| 31 |
+
%cst_4 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1)
|
| 32 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc90)
|
| 33 |
+
%xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc91)
|
| 34 |
+
%r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc92)
|
| 35 |
+
%r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc92)
|
| 36 |
+
%x1 = arith.extsi %xoffset : i32 to i64 loc(#loc93)
|
| 37 |
+
%x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc93)
|
| 38 |
+
%x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc94)
|
| 39 |
+
%x0 = arith.remsi %x1, %ks0 : i64 loc(#loc95)
|
| 40 |
+
%x2 = arith.divsi %x1, %ks4 : i64 loc(#loc96)
|
| 41 |
+
%tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc97)
|
| 42 |
+
%tmp0_8 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc159)
|
| 43 |
+
%tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc99)
|
| 44 |
+
%tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc100)
|
| 45 |
+
%tmp3_9 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc160)
|
| 46 |
+
%tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc102)
|
| 47 |
+
%tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr<i64>, i64 loc(#loc103)
|
| 48 |
+
%tmp10_10 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc161)
|
| 49 |
+
%tmp10_11 = tt.splat %tmp10 : !tt.ptr<i64> -> tensor<1x2048x!tt.ptr<i64>, #blocked> loc(#loc105)
|
| 50 |
+
%tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc106)
|
| 51 |
+
%tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc107)
|
| 52 |
+
%tmp23_12 = tt.splat %tmp23 : i1 -> tensor<1x2048xi1, #blocked> loc(#loc107)
|
| 53 |
+
%tmp30 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc108)
|
| 54 |
+
%tmp30_13 = tt.splat %tmp30 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc162)
|
| 55 |
+
%_tmp46 = scf.for %_tmp46_15 = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%arg12 = %cst_4) -> (tensor<1x2048xi64, #blocked>) : i32 {
|
| 56 |
+
%r0_index = tt.splat %_tmp46_15 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc111)
|
| 57 |
+
%r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc111)
|
| 58 |
+
%r0_mask = arith.cmpi slt, %r0_index_16, %cst_0 : tensor<1x2048xi32, #blocked> loc(#loc112)
|
| 59 |
+
%r0_4 = arith.divsi %r0_index_16, %cst : tensor<1x2048xi32, #blocked> loc(#loc113)
|
| 60 |
+
%r0_3 = arith.remsi %r0_index_16, %cst : tensor<1x2048xi32, #blocked> loc(#loc114)
|
| 61 |
+
%tmp0_17 = arith.extsi %r0_4 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc98)
|
| 62 |
+
%tmp0_18 = arith.addi %tmp0_17, %tmp0_8 : tensor<1x2048xi64, #blocked> loc(#loc98)
|
| 63 |
+
%tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x2048xi64, #blocked> loc(#loc99)
|
| 64 |
+
%tmp3_20 = arith.extsi %r0_3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc101)
|
| 65 |
+
%tmp3_21 = arith.addi %tmp3_20, %tmp3_9 : tensor<1x2048xi64, #blocked> loc(#loc101)
|
| 66 |
+
%tmp5_22 = arith.cmpi slt, %tmp3_21, %tmp5 : tensor<1x2048xi64, #blocked> loc(#loc102)
|
| 67 |
+
%tmp6 = arith.andi %tmp2_19, %tmp5_22 : tensor<1x2048xi1, #blocked> loc(#loc115)
|
| 68 |
+
%tmp9 = arith.cmpi sge, %tmp0_18, %tmp3_21 : tensor<1x2048xi64, #blocked> loc(#loc116)
|
| 69 |
+
%tmp10_23 = arith.andi %r0_mask, %tmp6 : tensor<1x2048xi1, #blocked> loc(#loc117)
|
| 70 |
+
%tmp10_24 = arith.andi %tmp10_23, %tmp10_10 : tensor<1x2048xi1, #blocked> loc(#loc104)
|
| 71 |
+
%tmp10_25 = tt.load %tmp10_11, %tmp10_24, %cst_4 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<i64>, #blocked> loc(#loc105)
|
| 72 |
+
%tmp11 = arith.cmpi slt, %tmp3_21, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc118)
|
| 73 |
+
%tmp12 = arith.cmpi slt, %tmp0_18, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc119)
|
| 74 |
+
%tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1, #blocked> loc(#loc120)
|
| 75 |
+
%tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1, #blocked> loc(#loc121)
|
| 76 |
+
%tmp18 = arith.cmpi sge, %tmp3_21, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc122)
|
| 77 |
+
%tmp19 = arith.remsi %tmp3_21, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc123)
|
| 78 |
+
%tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc124)
|
| 79 |
+
%tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc125)
|
| 80 |
+
%tmp24 = arith.cmpi ne, %tmp22, %tmp23_12 : tensor<1x2048xi1, #blocked> loc(#loc126)
|
| 81 |
+
%tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x2048xi1, #blocked> loc(#loc127)
|
| 82 |
+
%tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc128)
|
| 83 |
+
%tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc129)
|
| 84 |
+
%tmp28 = arith.cmpi slt, %tmp27, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc130)
|
| 85 |
+
%tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1, #blocked> loc(#loc131)
|
| 86 |
+
%tmp30_26 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32, #blocked> loc(#loc132)
|
| 87 |
+
%tmp30_27 = arith.extsi %tmp30_26 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc109)
|
| 88 |
+
%tmp30_28 = arith.addi %tmp30_27, %tmp30_13 : tensor<1x2048xi64, #blocked> loc(#loc109)
|
| 89 |
+
%tmp30_29 = arith.addi %tmp30_28, %tmp3_9 : tensor<1x2048xi64, #blocked> loc(#loc133)
|
| 90 |
+
%tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc134)
|
| 91 |
+
%tmp32 = arith.cmpi ne, %tmp31, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc135)
|
| 92 |
+
%tmp33 = arith.cmpi slt, %tmp31, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc136)
|
| 93 |
+
%tmp34 = arith.cmpi ne, %tmp33, %tmp23_12 : tensor<1x2048xi1, #blocked> loc(#loc137)
|
| 94 |
+
%tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x2048xi1, #blocked> loc(#loc138)
|
| 95 |
+
%tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc139)
|
| 96 |
+
%tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc140)
|
| 97 |
+
%tmp39 = arith.cmpi eq, %tmp37, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc141)
|
| 98 |
+
%tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x2048xi1, #blocked> loc(#loc142)
|
| 99 |
+
%tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x2048xi1, #blocked> loc(#loc143)
|
| 100 |
+
%tmp43 = arith.select %tmp6, %tmp41, %cst_3 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi1, #blocked> loc(#loc144)
|
| 101 |
+
%tmp44 = arith.extui %tmp43 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc145)
|
| 102 |
+
%tmp47 = arith.addi %arg12, %tmp44 : tensor<1x2048xi64, #blocked> loc(#loc146)
|
| 103 |
+
%_tmp46_30 = arith.andi %r0_mask, %tmp10_10 : tensor<1x2048xi1, #blocked> loc(#loc147)
|
| 104 |
+
%_tmp46_31 = arith.select %_tmp46_30, %tmp47, %arg12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc148)
|
| 105 |
+
scf.yield %_tmp46_31 : tensor<1x2048xi64, #blocked> loc(#loc61)
|
| 106 |
+
} loc(#loc110)
|
| 107 |
+
%tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({
|
| 108 |
+
^bb0(%tmp46_15: i64 loc(callsite(#loc1 at #loc149)), %tmp46_16: i64 loc(callsite(#loc1 at #loc149))):
|
| 109 |
+
%tmp46_17 = arith.addi %tmp46_15, %tmp46_16 : i64 loc(#loc167)
|
| 110 |
+
tt.reduce.return %tmp46_17 : i64 loc(#loc163)
|
| 111 |
+
}) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc163)
|
| 112 |
+
%tmp46_14 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc150)
|
| 113 |
+
%tmp49 = arith.cmpi sgt, %tmp46_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc151)
|
| 114 |
+
%tmp51 = arith.cmpi slt, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc152)
|
| 115 |
+
%tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1, #blocked> loc(#loc153)
|
| 116 |
+
%tmp54 = arith.extui %tmp52 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc165)
|
| 117 |
+
%tmp55 = arith.cmpi eq, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc156)
|
| 118 |
+
%tmp57 = arith.extui %tmp55 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc166)
|
| 119 |
+
%0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc74)
|
| 120 |
+
%1 = tt.splat %0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc75)
|
| 121 |
+
%2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc75)
|
| 122 |
+
tt.store %1, %tmp54, %2 : tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc75)
|
| 123 |
+
%3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr<i32>, i32 loc(#loc76)
|
| 124 |
+
%4 = tt.splat %3 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc77)
|
| 125 |
+
tt.store %4, %tmp57, %2 : tensor<1x1x!tt.ptr<i32>, #blocked> loc(#loc77)
|
| 126 |
+
tt.return loc(#loc78)
|
| 127 |
+
} loc(#loc)
|
| 128 |
+
} loc(#loc)
|
| 129 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28)
|
| 130 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21)
|
| 131 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37)
|
| 132 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21)
|
| 133 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28)
|
| 134 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19)
|
| 135 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19)
|
| 136 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26)
|
| 137 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22)
|
| 138 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22)
|
| 139 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26)
|
| 140 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22)
|
| 141 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22)
|
| 142 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35)
|
| 143 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94)
|
| 144 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77)
|
| 145 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37)
|
| 146 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92)
|
| 147 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45)
|
| 148 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38)
|
| 149 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40)
|
| 150 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31)
|
| 151 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29)
|
| 152 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27)
|
| 153 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27)
|
| 154 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22)
|
| 155 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23)
|
| 156 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87)
|
| 157 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23)
|
| 158 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23)
|
| 159 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24)
|
| 160 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23)
|
| 161 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24)
|
| 162 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24)
|
| 163 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25)
|
| 164 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92)
|
| 165 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25)
|
| 166 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24)
|
| 167 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24)
|
| 168 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39)
|
| 169 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24)
|
| 170 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24)
|
| 171 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24)
|
| 172 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51)
|
| 173 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25)
|
| 174 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25)
|
| 175 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92)
|
| 176 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25)
|
| 177 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24)
|
| 178 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24)
|
| 179 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39)
|
| 180 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25)
|
| 181 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24)
|
| 182 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24)
|
| 183 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38)
|
| 184 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25)
|
| 185 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25)
|
| 186 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36)
|
| 187 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50)
|
| 188 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8)
|
| 189 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 190 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 191 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30)
|
| 192 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20)
|
| 193 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20)
|
| 194 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20)
|
| 195 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21)
|
| 196 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21)
|
| 197 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21)
|
| 198 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21)
|
| 199 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21)
|
| 200 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25)
|
| 201 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37)
|
| 202 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25)
|
| 203 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37)
|
| 204 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4)
|
| 205 |
+
#loc90 = loc("xoffset"(#loc2))
|
| 206 |
+
#loc91 = loc("xmask"(#loc3))
|
| 207 |
+
#loc92 = loc("r0_base"(#loc4))
|
| 208 |
+
#loc93 = loc("x1"(#loc5))
|
| 209 |
+
#loc94 = loc("x1"(#loc6))
|
| 210 |
+
#loc95 = loc("x0"(#loc7))
|
| 211 |
+
#loc96 = loc("x2"(#loc8))
|
| 212 |
+
#loc97 = loc("tmp0"(#loc9))
|
| 213 |
+
#loc98 = loc("tmp0"(#loc10))
|
| 214 |
+
#loc99 = loc("tmp2"(#loc11))
|
| 215 |
+
#loc100 = loc("tmp3"(#loc12))
|
| 216 |
+
#loc101 = loc("tmp3"(#loc13))
|
| 217 |
+
#loc102 = loc("tmp5"(#loc14))
|
| 218 |
+
#loc103 = loc("tmp10"(#loc15))
|
| 219 |
+
#loc104 = loc("tmp10"(#loc16))
|
| 220 |
+
#loc105 = loc("tmp10"(#loc17))
|
| 221 |
+
#loc106 = loc("tmp17"(#loc18))
|
| 222 |
+
#loc107 = loc("tmp23"(#loc19))
|
| 223 |
+
#loc108 = loc("tmp30"(#loc20))
|
| 224 |
+
#loc109 = loc("tmp30"(#loc21))
|
| 225 |
+
#loc110 = loc("_tmp46"(#loc22))
|
| 226 |
+
#loc111 = loc("r0_index"(#loc23))
|
| 227 |
+
#loc112 = loc("r0_mask"(#loc24))
|
| 228 |
+
#loc113 = loc("r0_4"(#loc25))
|
| 229 |
+
#loc114 = loc("r0_3"(#loc26))
|
| 230 |
+
#loc115 = loc("tmp6"(#loc27))
|
| 231 |
+
#loc116 = loc("tmp9"(#loc28))
|
| 232 |
+
#loc117 = loc("tmp10"(#loc29))
|
| 233 |
+
#loc118 = loc("tmp11"(#loc30))
|
| 234 |
+
#loc119 = loc("tmp12"(#loc31))
|
| 235 |
+
#loc120 = loc("tmp13"(#loc32))
|
| 236 |
+
#loc121 = loc("tmp14"(#loc33))
|
| 237 |
+
#loc122 = loc("tmp18"(#loc34))
|
| 238 |
+
#loc123 = loc("tmp19"(#loc35))
|
| 239 |
+
#loc124 = loc("tmp21"(#loc36))
|
| 240 |
+
#loc125 = loc("tmp22"(#loc37))
|
| 241 |
+
#loc126 = loc("tmp24"(#loc38))
|
| 242 |
+
#loc127 = loc("tmp25"(#loc39))
|
| 243 |
+
#loc128 = loc("tmp26"(#loc40))
|
| 244 |
+
#loc129 = loc("tmp27"(#loc41))
|
| 245 |
+
#loc130 = loc("tmp28"(#loc42))
|
| 246 |
+
#loc131 = loc("tmp29"(#loc43))
|
| 247 |
+
#loc132 = loc("tmp30"(#loc44))
|
| 248 |
+
#loc133 = loc("tmp30"(#loc45))
|
| 249 |
+
#loc134 = loc("tmp31"(#loc46))
|
| 250 |
+
#loc135 = loc("tmp32"(#loc47))
|
| 251 |
+
#loc136 = loc("tmp33"(#loc48))
|
| 252 |
+
#loc137 = loc("tmp34"(#loc49))
|
| 253 |
+
#loc138 = loc("tmp35"(#loc50))
|
| 254 |
+
#loc139 = loc("tmp36"(#loc51))
|
| 255 |
+
#loc140 = loc("tmp37"(#loc52))
|
| 256 |
+
#loc141 = loc("tmp39"(#loc53))
|
| 257 |
+
#loc142 = loc("tmp40"(#loc54))
|
| 258 |
+
#loc143 = loc("tmp41"(#loc55))
|
| 259 |
+
#loc144 = loc("tmp43"(#loc56))
|
| 260 |
+
#loc145 = loc("tmp44"(#loc57))
|
| 261 |
+
#loc146 = loc("tmp47"(#loc58))
|
| 262 |
+
#loc147 = loc("_tmp46"(#loc59))
|
| 263 |
+
#loc148 = loc("_tmp46"(#loc60))
|
| 264 |
+
#loc150 = loc("tmp46"(#loc65))
|
| 265 |
+
#loc151 = loc("tmp49"(#loc66))
|
| 266 |
+
#loc152 = loc("tmp51"(#loc67))
|
| 267 |
+
#loc153 = loc("tmp52"(#loc68))
|
| 268 |
+
#loc154 = loc("tmp54"(#loc69))
|
| 269 |
+
#loc155 = loc("tmp53"(#loc70))
|
| 270 |
+
#loc156 = loc("tmp55"(#loc71))
|
| 271 |
+
#loc157 = loc("tmp57"(#loc72))
|
| 272 |
+
#loc158 = loc("tmp56"(#loc73))
|
| 273 |
+
#loc159 = loc(fused[#loc98, #loc97])
|
| 274 |
+
#loc160 = loc(fused[#loc101, #loc100])
|
| 275 |
+
#loc161 = loc(fused[#loc104, #loc91])
|
| 276 |
+
#loc162 = loc(fused[#loc109, #loc108])
|
| 277 |
+
#loc163 = loc(callsite(#loc62 at #loc149))
|
| 278 |
+
#loc165 = loc(fused[#loc154, #loc155])
|
| 279 |
+
#loc166 = loc(fused[#loc157, #loc158])
|
| 280 |
+
#loc167 = loc(callsite(#loc64 at #loc163))
|
SpecForge-ext/cache/compiled_kernels/triton/3/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27)
|
| 4 |
+
#loc81 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc82 = loc("out_ptr1"(#loc))
|
| 6 |
+
#loc83 = loc("out_ptr2"(#loc))
|
| 7 |
+
#loc84 = loc("ks0"(#loc))
|
| 8 |
+
#loc85 = loc("ks1"(#loc))
|
| 9 |
+
#loc86 = loc("ks2"(#loc))
|
| 10 |
+
#loc87 = loc("ks3"(#loc))
|
| 11 |
+
#loc88 = loc("ks4"(#loc))
|
| 12 |
+
#loc89 = loc("ks5"(#loc))
|
| 13 |
+
#loc90 = loc("xnumel"(#loc))
|
| 14 |
+
#loc91 = loc("r0_numel"(#loc))
|
| 15 |
+
#loc153 = loc("tmp46"(#loc65))
|
| 16 |
+
#loc168 = loc(callsite(#loc1 at #loc153))
|
| 17 |
+
module {
|
| 18 |
+
tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 19 |
+
%c-128_i64 = arith.constant -128 : i64 loc(#loc1)
|
| 20 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 21 |
+
%c128_i64 = arith.constant 128 : i64 loc(#loc1)
|
| 22 |
+
%c2048_i32 = arith.constant 2048 : i32 loc(#loc2)
|
| 23 |
+
%c16384_i32 = arith.constant 16384 : i32 loc(#loc2)
|
| 24 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc2)
|
| 25 |
+
%tmp50 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc92)
|
| 26 |
+
%cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1)
|
| 27 |
+
%cst_0 = arith.constant dense<false> : tensor<1x2048xi1> loc(#loc1)
|
| 28 |
+
%cst_1 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc1)
|
| 29 |
+
%cst_2 = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc1)
|
| 30 |
+
%cst_3 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc1)
|
| 31 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc93)
|
| 32 |
+
%xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc94)
|
| 33 |
+
%xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc94)
|
| 34 |
+
%r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc95)
|
| 35 |
+
%r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc96)
|
| 36 |
+
%x1 = arith.extsi %xoffset : i32 to i64 loc(#loc97)
|
| 37 |
+
%x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc97)
|
| 38 |
+
%x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc98)
|
| 39 |
+
%x0 = arith.remsi %x1, %ks0 : i64 loc(#loc99)
|
| 40 |
+
%x2 = arith.divsi %x1, %ks4 : i64 loc(#loc100)
|
| 41 |
+
%_tmp46 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp46_9 = %cst_3) -> (tensor<1x2048xi64>) : i32 {
|
| 42 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc102)
|
| 43 |
+
%r0_index_10 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32> loc(#loc102)
|
| 44 |
+
%r0_mask = arith.cmpi slt, %r0_index_10, %cst_2 : tensor<1x2048xi32> loc(#loc103)
|
| 45 |
+
%r0_4 = arith.divsi %r0_index_10, %cst_1 : tensor<1x2048xi32> loc(#loc104)
|
| 46 |
+
%r0_3 = arith.remsi %r0_index_10, %cst_1 : tensor<1x2048xi32> loc(#loc105)
|
| 47 |
+
%tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc106)
|
| 48 |
+
%tmp0_11 = arith.extsi %r0_4 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc107)
|
| 49 |
+
%tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc163)
|
| 50 |
+
%tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x2048xi64> loc(#loc107)
|
| 51 |
+
%tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64> loc(#loc108)
|
| 52 |
+
%tmp2_14 = arith.cmpi slt, %tmp0_13, %tmp2 : tensor<1x2048xi64> loc(#loc108)
|
| 53 |
+
%tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc109)
|
| 54 |
+
%tmp3_15 = arith.extsi %r0_3 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc110)
|
| 55 |
+
%tmp3_16 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64> loc(#loc164)
|
| 56 |
+
%tmp3_17 = arith.addi %tmp3_15, %tmp3_16 : tensor<1x2048xi64> loc(#loc110)
|
| 57 |
+
%tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64> loc(#loc111)
|
| 58 |
+
%tmp5_18 = arith.cmpi slt, %tmp3_17, %tmp5 : tensor<1x2048xi64> loc(#loc111)
|
| 59 |
+
%tmp6 = arith.andi %tmp2_14, %tmp5_18 : tensor<1x2048xi1> loc(#loc112)
|
| 60 |
+
%tmp9 = arith.cmpi sge, %tmp0_13, %tmp3_17 : tensor<1x2048xi64> loc(#loc113)
|
| 61 |
+
%tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr<i64>, i64 loc(#loc114)
|
| 62 |
+
%tmp10_19 = tt.splat %tmp10 : !tt.ptr<i64> -> tensor<1x2048x!tt.ptr<i64>> loc(#loc114)
|
| 63 |
+
%tmp10_20 = arith.andi %r0_mask, %tmp6 : tensor<1x2048xi1> loc(#loc115)
|
| 64 |
+
%tmp10_21 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc165)
|
| 65 |
+
%tmp10_22 = arith.andi %tmp10_20, %tmp10_21 : tensor<1x2048xi1> loc(#loc116)
|
| 66 |
+
%tmp10_23 = tt.load %tmp10_19, %tmp10_22, %cst_3 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<i64>> loc(#loc117)
|
| 67 |
+
%tmp11 = arith.cmpi slt, %tmp3_17, %tmp10_23 : tensor<1x2048xi64> loc(#loc118)
|
| 68 |
+
%tmp12 = arith.cmpi slt, %tmp0_13, %tmp10_23 : tensor<1x2048xi64> loc(#loc119)
|
| 69 |
+
%tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1> loc(#loc120)
|
| 70 |
+
%tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1> loc(#loc121)
|
| 71 |
+
%tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64> loc(#loc122)
|
| 72 |
+
%tmp18 = arith.cmpi sge, %tmp3_17, %tmp17 : tensor<1x2048xi64> loc(#loc123)
|
| 73 |
+
%tmp19 = arith.remsi %tmp3_17, %tmp17 : tensor<1x2048xi64> loc(#loc124)
|
| 74 |
+
%tmp21 = arith.cmpi ne, %tmp19, %cst_3 : tensor<1x2048xi64> loc(#loc125)
|
| 75 |
+
%tmp22 = arith.cmpi slt, %tmp19, %cst_3 : tensor<1x2048xi64> loc(#loc126)
|
| 76 |
+
%tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc127)
|
| 77 |
+
%tmp23_24 = tt.splat %tmp23 : i1 -> tensor<1x2048xi1> loc(#loc127)
|
| 78 |
+
%tmp24 = arith.cmpi ne, %tmp22, %tmp23_24 : tensor<1x2048xi1> loc(#loc128)
|
| 79 |
+
%tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x2048xi1> loc(#loc129)
|
| 80 |
+
%tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64> loc(#loc130)
|
| 81 |
+
%tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc131)
|
| 82 |
+
%tmp28 = arith.cmpi slt, %tmp27, %tmp10_23 : tensor<1x2048xi64> loc(#loc132)
|
| 83 |
+
%tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1> loc(#loc133)
|
| 84 |
+
%tmp30 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32> loc(#loc134)
|
| 85 |
+
%tmp30_25 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc135)
|
| 86 |
+
%tmp30_26 = arith.extsi %tmp30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc136)
|
| 87 |
+
%tmp30_27 = tt.splat %tmp30_25 : i64 -> tensor<1x2048xi64> loc(#loc166)
|
| 88 |
+
%tmp30_28 = arith.addi %tmp30_26, %tmp30_27 : tensor<1x2048xi64> loc(#loc136)
|
| 89 |
+
%tmp30_29 = arith.addi %tmp30_28, %tmp3_16 : tensor<1x2048xi64> loc(#loc137)
|
| 90 |
+
%tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x2048xi64> loc(#loc138)
|
| 91 |
+
%tmp32 = arith.cmpi ne, %tmp31, %cst_3 : tensor<1x2048xi64> loc(#loc139)
|
| 92 |
+
%tmp33 = arith.cmpi slt, %tmp31, %cst_3 : tensor<1x2048xi64> loc(#loc140)
|
| 93 |
+
%tmp34 = arith.cmpi ne, %tmp33, %tmp23_24 : tensor<1x2048xi1> loc(#loc141)
|
| 94 |
+
%tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x2048xi1> loc(#loc142)
|
| 95 |
+
%tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64> loc(#loc143)
|
| 96 |
+
%tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc144)
|
| 97 |
+
%tmp39 = arith.cmpi eq, %tmp37, %cst_3 : tensor<1x2048xi64> loc(#loc145)
|
| 98 |
+
%tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x2048xi1> loc(#loc146)
|
| 99 |
+
%tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x2048xi1> loc(#loc147)
|
| 100 |
+
%tmp43 = arith.select %tmp6, %tmp41, %cst_0 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc148)
|
| 101 |
+
%tmp44 = arith.extui %tmp43 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc149)
|
| 102 |
+
%tmp47 = arith.addi %_tmp46_9, %tmp44 : tensor<1x2048xi64> loc(#loc150)
|
| 103 |
+
%_tmp46_30 = arith.andi %r0_mask, %tmp10_21 : tensor<1x2048xi1> loc(#loc151)
|
| 104 |
+
%_tmp46_31 = arith.select %_tmp46_30, %tmp47, %_tmp46_9 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc152)
|
| 105 |
+
scf.yield %_tmp46_31 : tensor<1x2048xi64> loc(#loc63)
|
| 106 |
+
} loc(#loc101)
|
| 107 |
+
%tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({
|
| 108 |
+
^bb0(%tmp46_9: i64 loc(callsite(#loc1 at #loc153)), %tmp46_10: i64 loc(callsite(#loc1 at #loc153))):
|
| 109 |
+
%tmp46_11 = arith.addi %tmp46_9, %tmp46_10 : i64 loc(#loc171)
|
| 110 |
+
tt.reduce.return %tmp46_11 : i64 loc(#loc167)
|
| 111 |
+
}) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc167)
|
| 112 |
+
%tmp46_8 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc154)
|
| 113 |
+
%tmp49 = arith.cmpi sgt, %tmp46_8, %cst : tensor<1x1xi64> loc(#loc155)
|
| 114 |
+
%tmp51 = arith.cmpi slt, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc156)
|
| 115 |
+
%tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc157)
|
| 116 |
+
%tmp54 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc169)
|
| 117 |
+
%tmp55 = arith.cmpi eq, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc160)
|
| 118 |
+
%tmp57 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc170)
|
| 119 |
+
%0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr<i32>, i32 loc(#loc76)
|
| 120 |
+
%1 = tt.splat %0 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc76)
|
| 121 |
+
tt.store %1, %tmp54, %xmask_4 : tensor<1x1x!tt.ptr<i32>> loc(#loc77)
|
| 122 |
+
%2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr<i32>, i32 loc(#loc78)
|
| 123 |
+
%3 = tt.splat %2 : !tt.ptr<i32> -> tensor<1x1x!tt.ptr<i32>> loc(#loc78)
|
| 124 |
+
tt.store %3, %tmp57, %xmask_4 : tensor<1x1x!tt.ptr<i32>> loc(#loc79)
|
| 125 |
+
tt.return loc(#loc80)
|
| 126 |
+
} loc(#loc)
|
| 127 |
+
} loc(#loc)
|
| 128 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40)
|
| 129 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":90:35)
|
| 130 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28)
|
| 131 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21)
|
| 132 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:27)
|
| 133 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37)
|
| 134 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21)
|
| 135 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28)
|
| 136 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19)
|
| 137 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19)
|
| 138 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31)
|
| 139 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29)
|
| 140 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27)
|
| 141 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27)
|
| 142 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26)
|
| 143 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22)
|
| 144 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22)
|
| 145 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26)
|
| 146 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22)
|
| 147 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22)
|
| 148 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22)
|
| 149 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23)
|
| 150 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35)
|
| 151 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87)
|
| 152 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94)
|
| 153 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77)
|
| 154 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23)
|
| 155 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23)
|
| 156 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24)
|
| 157 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23)
|
| 158 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37)
|
| 159 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24)
|
| 160 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24)
|
| 161 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25)
|
| 162 |
+
#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92)
|
| 163 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92)
|
| 164 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25)
|
| 165 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24)
|
| 166 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24)
|
| 167 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39)
|
| 168 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24)
|
| 169 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24)
|
| 170 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24)
|
| 171 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45)
|
| 172 |
+
#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38)
|
| 173 |
+
#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51)
|
| 174 |
+
#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25)
|
| 175 |
+
#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25)
|
| 176 |
+
#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92)
|
| 177 |
+
#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25)
|
| 178 |
+
#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24)
|
| 179 |
+
#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24)
|
| 180 |
+
#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39)
|
| 181 |
+
#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25)
|
| 182 |
+
#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24)
|
| 183 |
+
#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24)
|
| 184 |
+
#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38)
|
| 185 |
+
#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25)
|
| 186 |
+
#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25)
|
| 187 |
+
#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36)
|
| 188 |
+
#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50)
|
| 189 |
+
#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8)
|
| 190 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 191 |
+
#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 192 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30)
|
| 193 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20)
|
| 194 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20)
|
| 195 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20)
|
| 196 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21)
|
| 197 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21)
|
| 198 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21)
|
| 199 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21)
|
| 200 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21)
|
| 201 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25)
|
| 202 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37)
|
| 203 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25)
|
| 204 |
+
#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37)
|
| 205 |
+
#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4)
|
| 206 |
+
#loc92 = loc("tmp50"(#loc3))
|
| 207 |
+
#loc93 = loc("xoffset"(#loc4))
|
| 208 |
+
#loc94 = loc("xmask"(#loc5))
|
| 209 |
+
#loc95 = loc("r0_base"(#loc6))
|
| 210 |
+
#loc96 = loc("r0_base"(#loc7))
|
| 211 |
+
#loc97 = loc("x1"(#loc8))
|
| 212 |
+
#loc98 = loc("x1"(#loc9))
|
| 213 |
+
#loc99 = loc("x0"(#loc10))
|
| 214 |
+
#loc100 = loc("x2"(#loc11))
|
| 215 |
+
#loc101 = loc("_tmp46"(#loc2))
|
| 216 |
+
#loc102 = loc("r0_index"(#loc12))
|
| 217 |
+
#loc103 = loc("r0_mask"(#loc13))
|
| 218 |
+
#loc104 = loc("r0_4"(#loc14))
|
| 219 |
+
#loc105 = loc("r0_3"(#loc15))
|
| 220 |
+
#loc106 = loc("tmp0"(#loc16))
|
| 221 |
+
#loc107 = loc("tmp0"(#loc17))
|
| 222 |
+
#loc108 = loc("tmp2"(#loc18))
|
| 223 |
+
#loc109 = loc("tmp3"(#loc19))
|
| 224 |
+
#loc110 = loc("tmp3"(#loc20))
|
| 225 |
+
#loc111 = loc("tmp5"(#loc21))
|
| 226 |
+
#loc112 = loc("tmp6"(#loc22))
|
| 227 |
+
#loc113 = loc("tmp9"(#loc23))
|
| 228 |
+
#loc114 = loc("tmp10"(#loc24))
|
| 229 |
+
#loc115 = loc("tmp10"(#loc25))
|
| 230 |
+
#loc116 = loc("tmp10"(#loc26))
|
| 231 |
+
#loc117 = loc("tmp10"(#loc27))
|
| 232 |
+
#loc118 = loc("tmp11"(#loc28))
|
| 233 |
+
#loc119 = loc("tmp12"(#loc29))
|
| 234 |
+
#loc120 = loc("tmp13"(#loc30))
|
| 235 |
+
#loc121 = loc("tmp14"(#loc31))
|
| 236 |
+
#loc122 = loc("tmp17"(#loc32))
|
| 237 |
+
#loc123 = loc("tmp18"(#loc33))
|
| 238 |
+
#loc124 = loc("tmp19"(#loc34))
|
| 239 |
+
#loc125 = loc("tmp21"(#loc35))
|
| 240 |
+
#loc126 = loc("tmp22"(#loc36))
|
| 241 |
+
#loc127 = loc("tmp23"(#loc37))
|
| 242 |
+
#loc128 = loc("tmp24"(#loc38))
|
| 243 |
+
#loc129 = loc("tmp25"(#loc39))
|
| 244 |
+
#loc130 = loc("tmp26"(#loc40))
|
| 245 |
+
#loc131 = loc("tmp27"(#loc41))
|
| 246 |
+
#loc132 = loc("tmp28"(#loc42))
|
| 247 |
+
#loc133 = loc("tmp29"(#loc43))
|
| 248 |
+
#loc134 = loc("tmp30"(#loc44))
|
| 249 |
+
#loc135 = loc("tmp30"(#loc45))
|
| 250 |
+
#loc136 = loc("tmp30"(#loc46))
|
| 251 |
+
#loc137 = loc("tmp30"(#loc47))
|
| 252 |
+
#loc138 = loc("tmp31"(#loc48))
|
| 253 |
+
#loc139 = loc("tmp32"(#loc49))
|
| 254 |
+
#loc140 = loc("tmp33"(#loc50))
|
| 255 |
+
#loc141 = loc("tmp34"(#loc51))
|
| 256 |
+
#loc142 = loc("tmp35"(#loc52))
|
| 257 |
+
#loc143 = loc("tmp36"(#loc53))
|
| 258 |
+
#loc144 = loc("tmp37"(#loc54))
|
| 259 |
+
#loc145 = loc("tmp39"(#loc55))
|
| 260 |
+
#loc146 = loc("tmp40"(#loc56))
|
| 261 |
+
#loc147 = loc("tmp41"(#loc57))
|
| 262 |
+
#loc148 = loc("tmp43"(#loc58))
|
| 263 |
+
#loc149 = loc("tmp44"(#loc59))
|
| 264 |
+
#loc150 = loc("tmp47"(#loc60))
|
| 265 |
+
#loc151 = loc("_tmp46"(#loc61))
|
| 266 |
+
#loc152 = loc("_tmp46"(#loc62))
|
| 267 |
+
#loc154 = loc("tmp46"(#loc67))
|
| 268 |
+
#loc155 = loc("tmp49"(#loc68))
|
| 269 |
+
#loc156 = loc("tmp51"(#loc69))
|
| 270 |
+
#loc157 = loc("tmp52"(#loc70))
|
| 271 |
+
#loc158 = loc("tmp54"(#loc71))
|
| 272 |
+
#loc159 = loc("tmp53"(#loc72))
|
| 273 |
+
#loc160 = loc("tmp55"(#loc73))
|
| 274 |
+
#loc161 = loc("tmp57"(#loc74))
|
| 275 |
+
#loc162 = loc("tmp56"(#loc75))
|
| 276 |
+
#loc163 = loc(fused[#loc107, #loc106])
|
| 277 |
+
#loc164 = loc(fused[#loc110, #loc109])
|
| 278 |
+
#loc165 = loc(fused[#loc116, #loc94])
|
| 279 |
+
#loc166 = loc(fused[#loc136, #loc135])
|
| 280 |
+
#loc167 = loc(callsite(#loc64 at #loc153))
|
| 281 |
+
#loc169 = loc(fused[#loc158, #loc159])
|
| 282 |
+
#loc170 = loc(fused[#loc161, #loc162])
|
| 283 |
+
#loc171 = loc(callsite(#loc66 at #loc167))
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin
ADDED
|
Binary file (43.9 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "dcf9c3d58e1d47ce5613d5d0ceadd19ad1e3325e6f9b526d3b2abd496024081a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 512, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"}
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir
ADDED
|
@@ -0,0 +1,781 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%9 = shl i32 %8, 3, !dbg !8
|
| 11 |
+
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%11 = and i32 %10, 24, !dbg !9
|
| 13 |
+
%12 = lshr i32 %10, 5, !dbg !9
|
| 14 |
+
%13 = and i32 %10, 7, !dbg !9
|
| 15 |
+
%14 = lshr i32 %10, 3, !dbg !9
|
| 16 |
+
%15 = and i32 %14, 7, !dbg !9
|
| 17 |
+
%16 = or disjoint i32 %9, %13, !dbg !10
|
| 18 |
+
%17 = or disjoint i32 %15, %9, !dbg !10
|
| 19 |
+
%18 = icmp slt i32 %16, 32, !dbg !11
|
| 20 |
+
%19 = icmp slt i32 %17, 32, !dbg !11
|
| 21 |
+
%20 = or disjoint i32 %15, 8, !dbg !12
|
| 22 |
+
%21 = shl nuw nsw i32 %13, 1, !dbg !12
|
| 23 |
+
%22 = sdiv i32 %16, 16, !dbg !13
|
| 24 |
+
%23 = mul nuw nsw i32 %15, 17, !dbg !14
|
| 25 |
+
%24 = mul nuw nsw i32 %20, 17, !dbg !14
|
| 26 |
+
%25 = shl i32 %22, 8, !dbg !15
|
| 27 |
+
%26 = add i32 %25, %16, !dbg !15
|
| 28 |
+
%27 = add i32 %26, %23, !dbg !16
|
| 29 |
+
%28 = add i32 %26, %24, !dbg !16
|
| 30 |
+
%29 = sext i32 %27 to i64, !dbg !17
|
| 31 |
+
%30 = getelementptr i32, ptr addrspace(1) %0, i64 %29, !dbg !17
|
| 32 |
+
%31 = sext i32 %28 to i64, !dbg !17
|
| 33 |
+
%32 = getelementptr i32, ptr addrspace(1) %0, i64 %31, !dbg !17
|
| 34 |
+
%33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %30, i1 %18) #4, !dbg !18
|
| 35 |
+
%34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %32, i1 %18) #4, !dbg !18
|
| 36 |
+
%35 = lshr i32 %10, 4, !dbg !19
|
| 37 |
+
%.lobit = and i32 %35, 1, !dbg !19
|
| 38 |
+
%36 = and i32 %10, 8, !dbg !19
|
| 39 |
+
%.not = icmp eq i32 %36, 0, !dbg !19
|
| 40 |
+
%.lobit1 = lshr exact i32 %36, 3, !dbg !19
|
| 41 |
+
%37 = and i32 %10, 32, !dbg !19
|
| 42 |
+
%.not3 = icmp eq i32 %37, 0, !dbg !19
|
| 43 |
+
%.lobit2 = lshr exact i32 %37, 5, !dbg !19
|
| 44 |
+
%38 = xor i32 %.lobit1, 1, !dbg !23
|
| 45 |
+
%39 = xor i32 %.lobit, 1, !dbg !23
|
| 46 |
+
%40 = xor i32 %.lobit2, 1, !dbg !23
|
| 47 |
+
%41 = mul nuw nsw i32 %33, %38, !dbg !24
|
| 48 |
+
%42 = mul nuw nsw i32 %34, %38, !dbg !24
|
| 49 |
+
%43 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 8, i32 31), !dbg !25
|
| 50 |
+
%44 = add i32 %43, %41, !dbg !28
|
| 51 |
+
%45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %42, i32 8, i32 31), !dbg !25
|
| 52 |
+
%46 = add i32 %45, %42, !dbg !28
|
| 53 |
+
%47 = mul nuw nsw i32 %33, %.lobit1, !dbg !29
|
| 54 |
+
%48 = mul nuw nsw i32 %34, %.lobit1, !dbg !29
|
| 55 |
+
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !25
|
| 56 |
+
%50 = add i32 %49, %47, !dbg !28
|
| 57 |
+
%51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !25
|
| 58 |
+
%52 = add i32 %51, %48, !dbg !28
|
| 59 |
+
%53 = mul nuw nsw i32 %38, %15, !dbg !30
|
| 60 |
+
%54 = mul nuw nsw i32 %20, %38, !dbg !30
|
| 61 |
+
%55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 8, i32 31), !dbg !25
|
| 62 |
+
%56 = add i32 %55, %53, !dbg !28
|
| 63 |
+
%57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 8, i32 31), !dbg !25
|
| 64 |
+
%58 = add i32 %57, %54, !dbg !28
|
| 65 |
+
%59 = mul nuw nsw i32 %15, %.lobit1, !dbg !31
|
| 66 |
+
%60 = mul nuw nsw i32 %20, %.lobit1, !dbg !31
|
| 67 |
+
%61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !25
|
| 68 |
+
%62 = add i32 %61, %59, !dbg !28
|
| 69 |
+
%63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !25
|
| 70 |
+
%64 = add i32 %63, %60, !dbg !28
|
| 71 |
+
%65 = trunc i32 %35 to i1, !dbg !32
|
| 72 |
+
%66 = icmp sge i32 %44, %50, !dbg !32
|
| 73 |
+
%67 = icmp ne i32 %44, %50, !dbg !32
|
| 74 |
+
%68 = icmp sle i32 %56, %62, !dbg !32
|
| 75 |
+
%69 = or i1 %67, %68, !dbg !32
|
| 76 |
+
%70 = and i1 %66, %69, !dbg !32
|
| 77 |
+
%.not4 = xor i1 %70, %65, !dbg !32
|
| 78 |
+
%71 = icmp sge i32 %46, %52, !dbg !32
|
| 79 |
+
%72 = icmp ne i32 %46, %52, !dbg !32
|
| 80 |
+
%73 = icmp sle i32 %58, %64, !dbg !32
|
| 81 |
+
%74 = or i1 %72, %73, !dbg !32
|
| 82 |
+
%75 = and i1 %71, %74, !dbg !32
|
| 83 |
+
%.not5 = xor i1 %75, %65, !dbg !32
|
| 84 |
+
%76 = xor i32 %50, %44, !dbg !33
|
| 85 |
+
%77 = xor i32 %52, %46, !dbg !33
|
| 86 |
+
%78 = select i1 %.not4, i32 0, i32 %76, !dbg !34
|
| 87 |
+
%79 = select i1 %.not5, i32 0, i32 %77, !dbg !34
|
| 88 |
+
%80 = xor i32 %78, %33, !dbg !35
|
| 89 |
+
%81 = xor i32 %79, %34, !dbg !35
|
| 90 |
+
%82 = xor i32 %62, %56, !dbg !36
|
| 91 |
+
%83 = xor i32 %64, %58, !dbg !36
|
| 92 |
+
%84 = select i1 %.not4, i32 0, i32 %82, !dbg !37
|
| 93 |
+
%85 = select i1 %.not5, i32 0, i32 %83, !dbg !37
|
| 94 |
+
%86 = xor i32 %84, %15, !dbg !38
|
| 95 |
+
%87 = xor i32 %85, %20, !dbg !38
|
| 96 |
+
%88 = mul nuw nsw i32 %80, %39, !dbg !24
|
| 97 |
+
%89 = mul nuw nsw i32 %81, %39, !dbg !24
|
| 98 |
+
%90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !25
|
| 99 |
+
%91 = add i32 %88, %90, !dbg !28
|
| 100 |
+
%92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 16, i32 31), !dbg !25
|
| 101 |
+
%93 = add i32 %89, %92, !dbg !28
|
| 102 |
+
%94 = mul nuw nsw i32 %80, %.lobit, !dbg !29
|
| 103 |
+
%95 = mul nuw nsw i32 %81, %.lobit, !dbg !29
|
| 104 |
+
%96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !25
|
| 105 |
+
%97 = add i32 %94, %96, !dbg !28
|
| 106 |
+
%98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 16, i32 31), !dbg !25
|
| 107 |
+
%99 = add i32 %95, %98, !dbg !28
|
| 108 |
+
%100 = mul nuw nsw i32 %86, %39, !dbg !30
|
| 109 |
+
%101 = mul nuw nsw i32 %87, %39, !dbg !30
|
| 110 |
+
%102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 16, i32 31), !dbg !25
|
| 111 |
+
%103 = add i32 %100, %102, !dbg !28
|
| 112 |
+
%104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !25
|
| 113 |
+
%105 = add i32 %101, %104, !dbg !28
|
| 114 |
+
%106 = mul nuw nsw i32 %86, %.lobit, !dbg !31
|
| 115 |
+
%107 = mul nuw nsw i32 %87, %.lobit, !dbg !31
|
| 116 |
+
%108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !25
|
| 117 |
+
%109 = add i32 %106, %108, !dbg !28
|
| 118 |
+
%110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 16, i32 31), !dbg !25
|
| 119 |
+
%111 = add i32 %107, %110, !dbg !28
|
| 120 |
+
%112 = icmp slt i32 %91, %97, !dbg !39
|
| 121 |
+
%113 = icmp slt i32 %93, %99, !dbg !39
|
| 122 |
+
%114 = icmp eq i32 %91, %97, !dbg !40
|
| 123 |
+
%115 = icmp eq i32 %93, %99, !dbg !40
|
| 124 |
+
%116 = icmp sgt i32 %103, %109, !dbg !41
|
| 125 |
+
%117 = icmp sgt i32 %105, %111, !dbg !41
|
| 126 |
+
%118 = and i1 %114, %116, !dbg !42
|
| 127 |
+
%119 = and i1 %115, %117, !dbg !42
|
| 128 |
+
%120 = or i1 %112, %118, !dbg !43
|
| 129 |
+
%121 = or i1 %113, %119, !dbg !43
|
| 130 |
+
%122 = zext i1 %120 to i32, !dbg !44
|
| 131 |
+
%123 = zext i1 %121 to i32, !dbg !44
|
| 132 |
+
%.not6 = icmp eq i32 %.lobit2, %122, !dbg !32
|
| 133 |
+
%.not7 = icmp eq i32 %.lobit2, %123, !dbg !32
|
| 134 |
+
%124 = xor i32 %91, %97, !dbg !33
|
| 135 |
+
%125 = xor i32 %93, %99, !dbg !33
|
| 136 |
+
%126 = select i1 %.not6, i32 0, i32 %124, !dbg !34
|
| 137 |
+
%127 = select i1 %.not7, i32 0, i32 %125, !dbg !34
|
| 138 |
+
%128 = xor i32 %126, %80, !dbg !35
|
| 139 |
+
%129 = xor i32 %127, %81, !dbg !35
|
| 140 |
+
%130 = xor i32 %103, %109, !dbg !36
|
| 141 |
+
%131 = xor i32 %105, %111, !dbg !36
|
| 142 |
+
%132 = select i1 %.not6, i32 0, i32 %130, !dbg !37
|
| 143 |
+
%133 = select i1 %.not7, i32 0, i32 %131, !dbg !37
|
| 144 |
+
%134 = xor i32 %132, %86, !dbg !38
|
| 145 |
+
%135 = xor i32 %133, %87, !dbg !38
|
| 146 |
+
%136 = mul nuw nsw i32 %128, %38, !dbg !24
|
| 147 |
+
%137 = mul nuw nsw i32 %129, %38, !dbg !24
|
| 148 |
+
%138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !25
|
| 149 |
+
%139 = add i32 %136, %138, !dbg !28
|
| 150 |
+
%140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 8, i32 31), !dbg !25
|
| 151 |
+
%141 = add i32 %137, %140, !dbg !28
|
| 152 |
+
%142 = mul nuw nsw i32 %128, %.lobit1, !dbg !29
|
| 153 |
+
%143 = mul nuw nsw i32 %129, %.lobit1, !dbg !29
|
| 154 |
+
%144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !25
|
| 155 |
+
%145 = add i32 %142, %144, !dbg !28
|
| 156 |
+
%146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 8, i32 31), !dbg !25
|
| 157 |
+
%147 = add i32 %143, %146, !dbg !28
|
| 158 |
+
%148 = mul nuw nsw i32 %134, %38, !dbg !30
|
| 159 |
+
%149 = mul nuw nsw i32 %135, %38, !dbg !30
|
| 160 |
+
%150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !25
|
| 161 |
+
%151 = add i32 %148, %150, !dbg !28
|
| 162 |
+
%152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 8, i32 31), !dbg !25
|
| 163 |
+
%153 = add i32 %149, %152, !dbg !28
|
| 164 |
+
%154 = mul nuw nsw i32 %134, %.lobit1, !dbg !31
|
| 165 |
+
%155 = mul nuw nsw i32 %135, %.lobit1, !dbg !31
|
| 166 |
+
%156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !25
|
| 167 |
+
%157 = add i32 %154, %156, !dbg !28
|
| 168 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 8, i32 31), !dbg !25
|
| 169 |
+
%159 = add i32 %155, %158, !dbg !28
|
| 170 |
+
%160 = icmp slt i32 %139, %145, !dbg !39
|
| 171 |
+
%161 = icmp slt i32 %141, %147, !dbg !39
|
| 172 |
+
%162 = icmp eq i32 %139, %145, !dbg !40
|
| 173 |
+
%163 = icmp eq i32 %141, %147, !dbg !40
|
| 174 |
+
%164 = icmp sgt i32 %151, %157, !dbg !41
|
| 175 |
+
%165 = icmp sgt i32 %153, %159, !dbg !41
|
| 176 |
+
%166 = and i1 %162, %164, !dbg !42
|
| 177 |
+
%167 = and i1 %163, %165, !dbg !42
|
| 178 |
+
%168 = or i1 %160, %166, !dbg !43
|
| 179 |
+
%169 = or i1 %161, %167, !dbg !43
|
| 180 |
+
%170 = zext i1 %168 to i32, !dbg !44
|
| 181 |
+
%171 = zext i1 %169 to i32, !dbg !44
|
| 182 |
+
%.not8 = icmp eq i32 %.lobit2, %170, !dbg !32
|
| 183 |
+
%.not9 = icmp eq i32 %.lobit2, %171, !dbg !32
|
| 184 |
+
%172 = xor i32 %139, %145, !dbg !33
|
| 185 |
+
%173 = xor i32 %141, %147, !dbg !33
|
| 186 |
+
%174 = select i1 %.not8, i32 0, i32 %172, !dbg !34
|
| 187 |
+
%175 = select i1 %.not9, i32 0, i32 %173, !dbg !34
|
| 188 |
+
%176 = xor i32 %174, %128, !dbg !35
|
| 189 |
+
%177 = xor i32 %175, %129, !dbg !35
|
| 190 |
+
%178 = xor i32 %151, %157, !dbg !36
|
| 191 |
+
%179 = xor i32 %153, %159, !dbg !36
|
| 192 |
+
%180 = select i1 %.not8, i32 0, i32 %178, !dbg !37
|
| 193 |
+
%181 = select i1 %.not9, i32 0, i32 %179, !dbg !37
|
| 194 |
+
%182 = xor i32 %180, %134, !dbg !38
|
| 195 |
+
%183 = xor i32 %181, %135, !dbg !38
|
| 196 |
+
%184 = mul nuw nsw i32 %176, %40, !dbg !24
|
| 197 |
+
%185 = mul nuw nsw i32 %177, %40, !dbg !24
|
| 198 |
+
%186 = and i32 %12, 1, !dbg !25
|
| 199 |
+
%187 = shl nuw nsw i32 %10, 1, !dbg !25
|
| 200 |
+
%188 = and i32 %187, 48, !dbg !25
|
| 201 |
+
%189 = or disjoint i32 %188, %21, !dbg !25
|
| 202 |
+
%.idx = shl nuw nsw i32 %189, 3, !dbg !25
|
| 203 |
+
%190 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !25
|
| 204 |
+
%191 = getelementptr i32, ptr addrspace(3) %190, i32 %186, !dbg !25
|
| 205 |
+
%192 = insertelement <1 x i32> poison, i32 %184, i64 0, !dbg !25
|
| 206 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %192, i1 true) #4, !dbg !25
|
| 207 |
+
%193 = getelementptr i8, ptr addrspace(3) %190, i32 8, !dbg !25
|
| 208 |
+
%194 = getelementptr i32, ptr addrspace(3) %193, i32 %186, !dbg !25
|
| 209 |
+
%195 = insertelement <1 x i32> poison, i32 %185, i64 0, !dbg !25
|
| 210 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %195, i1 true) #4, !dbg !25
|
| 211 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 212 |
+
%196 = icmp samesign ult i32 %10, 128, !dbg !25
|
| 213 |
+
%197 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %10, !dbg !25
|
| 214 |
+
%198 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 215 |
+
%199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !25
|
| 216 |
+
%200 = add i32 %199, %198, !dbg !28
|
| 217 |
+
%201 = and i32 %10, 897, !dbg !25
|
| 218 |
+
%202 = icmp eq i32 %201, 0, !dbg !25
|
| 219 |
+
%203 = insertelement <1 x i32> poison, i32 %200, i64 0, !dbg !25
|
| 220 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %203, i1 %202) #4, !dbg !25
|
| 221 |
+
%204 = getelementptr i8, ptr addrspace(3) %197, i32 256, !dbg !25
|
| 222 |
+
%205 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 223 |
+
%206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !25
|
| 224 |
+
%207 = add i32 %206, %205, !dbg !28
|
| 225 |
+
%208 = insertelement <1 x i32> poison, i32 %207, i64 0, !dbg !25
|
| 226 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %208, i1 %202) #4, !dbg !25
|
| 227 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 228 |
+
%209 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 229 |
+
%210 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 230 |
+
%211 = mul nuw nsw i32 %176, %.lobit2, !dbg !29
|
| 231 |
+
%212 = mul nuw nsw i32 %177, %.lobit2, !dbg !29
|
| 232 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 233 |
+
%213 = insertelement <1 x i32> poison, i32 %211, i64 0, !dbg !25
|
| 234 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %213, i1 true) #4, !dbg !25
|
| 235 |
+
%214 = insertelement <1 x i32> poison, i32 %212, i64 0, !dbg !25
|
| 236 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %214, i1 true) #4, !dbg !25
|
| 237 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 238 |
+
%215 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 239 |
+
%216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 1, i32 31), !dbg !25
|
| 240 |
+
%217 = add i32 %216, %215, !dbg !28
|
| 241 |
+
%218 = insertelement <1 x i32> poison, i32 %217, i64 0, !dbg !25
|
| 242 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %218, i1 %202) #4, !dbg !25
|
| 243 |
+
%219 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 244 |
+
%220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !25
|
| 245 |
+
%221 = add i32 %220, %219, !dbg !28
|
| 246 |
+
%222 = insertelement <1 x i32> poison, i32 %221, i64 0, !dbg !25
|
| 247 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %222, i1 %202) #4, !dbg !25
|
| 248 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 249 |
+
%223 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 250 |
+
%224 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 251 |
+
%225 = mul nuw nsw i32 %182, %40, !dbg !30
|
| 252 |
+
%226 = mul nuw nsw i32 %183, %40, !dbg !30
|
| 253 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 254 |
+
%227 = insertelement <1 x i32> poison, i32 %225, i64 0, !dbg !25
|
| 255 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %227, i1 true) #4, !dbg !25
|
| 256 |
+
%228 = insertelement <1 x i32> poison, i32 %226, i64 0, !dbg !25
|
| 257 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %228, i1 true) #4, !dbg !25
|
| 258 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 259 |
+
%229 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 260 |
+
%230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !25
|
| 261 |
+
%231 = add i32 %230, %229, !dbg !28
|
| 262 |
+
%232 = insertelement <1 x i32> poison, i32 %231, i64 0, !dbg !25
|
| 263 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %232, i1 %202) #4, !dbg !25
|
| 264 |
+
%233 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 265 |
+
%234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %233, i32 1, i32 31), !dbg !25
|
| 266 |
+
%235 = add i32 %234, %233, !dbg !28
|
| 267 |
+
%236 = insertelement <1 x i32> poison, i32 %235, i64 0, !dbg !25
|
| 268 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %236, i1 %202) #4, !dbg !25
|
| 269 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 270 |
+
%237 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 271 |
+
%238 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 272 |
+
%239 = mul nuw nsw i32 %182, %.lobit2, !dbg !31
|
| 273 |
+
%240 = mul nuw nsw i32 %183, %.lobit2, !dbg !31
|
| 274 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 275 |
+
%241 = insertelement <1 x i32> poison, i32 %239, i64 0, !dbg !25
|
| 276 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %241, i1 true) #4, !dbg !25
|
| 277 |
+
%242 = insertelement <1 x i32> poison, i32 %240, i64 0, !dbg !25
|
| 278 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %242, i1 true) #4, !dbg !25
|
| 279 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 280 |
+
%243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 281 |
+
%244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !25
|
| 282 |
+
%245 = add i32 %244, %243, !dbg !28
|
| 283 |
+
%246 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !25
|
| 284 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %246, i1 %202) #4, !dbg !25
|
| 285 |
+
%247 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 286 |
+
%248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 1, i32 31), !dbg !25
|
| 287 |
+
%249 = add i32 %248, %247, !dbg !28
|
| 288 |
+
%250 = insertelement <1 x i32> poison, i32 %249, i64 0, !dbg !25
|
| 289 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %250, i1 %202) #4, !dbg !25
|
| 290 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 291 |
+
%251 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 292 |
+
%252 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 293 |
+
%253 = icmp slt i32 %209, %223, !dbg !39
|
| 294 |
+
%254 = icmp sge i32 %210, %224, !dbg !39
|
| 295 |
+
%255 = icmp eq i32 %209, %223, !dbg !40
|
| 296 |
+
%256 = icmp ne i32 %210, %224, !dbg !40
|
| 297 |
+
%257 = icmp sgt i32 %237, %251, !dbg !41
|
| 298 |
+
%258 = icmp sle i32 %238, %252, !dbg !41
|
| 299 |
+
%259 = and i1 %255, %257, !dbg !42
|
| 300 |
+
%.not15 = or i1 %256, %258, !dbg !43
|
| 301 |
+
%260 = or i1 %253, %259, !dbg !43
|
| 302 |
+
%.not12 = and i1 %254, %.not15, !dbg !44
|
| 303 |
+
%261 = xor i32 %223, %209, !dbg !33
|
| 304 |
+
%262 = xor i32 %224, %210, !dbg !33
|
| 305 |
+
%263 = select i1 %260, i32 %261, i32 0, !dbg !34
|
| 306 |
+
%264 = select i1 %.not12, i32 %262, i32 0, !dbg !34
|
| 307 |
+
%265 = xor i32 %263, %176, !dbg !35
|
| 308 |
+
%266 = xor i32 %264, %177, !dbg !35
|
| 309 |
+
%267 = xor i32 %251, %237, !dbg !36
|
| 310 |
+
%268 = xor i32 %252, %238, !dbg !36
|
| 311 |
+
%269 = select i1 %260, i32 %267, i32 0, !dbg !37
|
| 312 |
+
%270 = select i1 %.not12, i32 %268, i32 0, !dbg !37
|
| 313 |
+
%271 = xor i32 %269, %182, !dbg !38
|
| 314 |
+
%272 = xor i32 %270, %183, !dbg !38
|
| 315 |
+
%273 = mul nuw nsw i32 %265, %39, !dbg !24
|
| 316 |
+
%274 = mul nuw nsw i32 %266, %39, !dbg !24
|
| 317 |
+
%275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 16, i32 31), !dbg !25
|
| 318 |
+
%276 = add i32 %273, %275, !dbg !28
|
| 319 |
+
%277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 16, i32 31), !dbg !25
|
| 320 |
+
%278 = add i32 %274, %277, !dbg !28
|
| 321 |
+
%279 = mul nuw nsw i32 %265, %.lobit, !dbg !29
|
| 322 |
+
%280 = mul nuw nsw i32 %266, %.lobit, !dbg !29
|
| 323 |
+
%281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 16, i32 31), !dbg !25
|
| 324 |
+
%282 = add i32 %279, %281, !dbg !28
|
| 325 |
+
%283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 16, i32 31), !dbg !25
|
| 326 |
+
%284 = add i32 %280, %283, !dbg !28
|
| 327 |
+
%285 = mul nuw nsw i32 %271, %39, !dbg !30
|
| 328 |
+
%286 = mul nuw nsw i32 %272, %39, !dbg !30
|
| 329 |
+
%287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 16, i32 31), !dbg !25
|
| 330 |
+
%288 = add i32 %285, %287, !dbg !28
|
| 331 |
+
%289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 16, i32 31), !dbg !25
|
| 332 |
+
%290 = add i32 %286, %289, !dbg !28
|
| 333 |
+
%291 = mul nuw nsw i32 %271, %.lobit, !dbg !31
|
| 334 |
+
%292 = mul nuw nsw i32 %272, %.lobit, !dbg !31
|
| 335 |
+
%293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 16, i32 31), !dbg !25
|
| 336 |
+
%294 = add i32 %293, %291, !dbg !28
|
| 337 |
+
%295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 16, i32 31), !dbg !25
|
| 338 |
+
%296 = add i32 %295, %292, !dbg !28
|
| 339 |
+
%297 = icmp slt i32 %276, %282, !dbg !39
|
| 340 |
+
%298 = icmp sge i32 %278, %284, !dbg !39
|
| 341 |
+
%299 = icmp eq i32 %276, %282, !dbg !40
|
| 342 |
+
%300 = icmp ne i32 %278, %284, !dbg !40
|
| 343 |
+
%301 = icmp sgt i32 %288, %294, !dbg !41
|
| 344 |
+
%302 = icmp sle i32 %290, %296, !dbg !41
|
| 345 |
+
%303 = and i1 %299, %301, !dbg !42
|
| 346 |
+
%.not21 = or i1 %300, %302, !dbg !43
|
| 347 |
+
%304 = or i1 %297, %303, !dbg !43
|
| 348 |
+
%.not18 = and i1 %298, %.not21, !dbg !44
|
| 349 |
+
%305 = xor i32 %276, %282, !dbg !33
|
| 350 |
+
%306 = xor i32 %278, %284, !dbg !33
|
| 351 |
+
%307 = select i1 %304, i32 %305, i32 0, !dbg !34
|
| 352 |
+
%308 = select i1 %.not18, i32 %306, i32 0, !dbg !34
|
| 353 |
+
%309 = xor i32 %307, %265, !dbg !35
|
| 354 |
+
%310 = xor i32 %308, %266, !dbg !35
|
| 355 |
+
%311 = xor i32 %294, %288, !dbg !36
|
| 356 |
+
%312 = xor i32 %296, %290, !dbg !36
|
| 357 |
+
%313 = select i1 %304, i32 %311, i32 0, !dbg !37
|
| 358 |
+
%314 = select i1 %.not18, i32 %312, i32 0, !dbg !37
|
| 359 |
+
%315 = xor i32 %313, %271, !dbg !38
|
| 360 |
+
%316 = xor i32 %314, %272, !dbg !38
|
| 361 |
+
%317 = mul nuw nsw i32 %309, %38, !dbg !24
|
| 362 |
+
%318 = mul nuw nsw i32 %310, %38, !dbg !24
|
| 363 |
+
%319 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 8, i32 31), !dbg !25
|
| 364 |
+
%320 = add i32 %317, %319, !dbg !28
|
| 365 |
+
%321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %318, i32 8, i32 31), !dbg !25
|
| 366 |
+
%322 = add i32 %318, %321, !dbg !28
|
| 367 |
+
%323 = mul nuw nsw i32 %309, %.lobit1, !dbg !29
|
| 368 |
+
%324 = mul nuw nsw i32 %310, %.lobit1, !dbg !29
|
| 369 |
+
%325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 8, i32 31), !dbg !25
|
| 370 |
+
%326 = add i32 %323, %325, !dbg !28
|
| 371 |
+
%327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 8, i32 31), !dbg !25
|
| 372 |
+
%328 = add i32 %324, %327, !dbg !28
|
| 373 |
+
%329 = mul nuw nsw i32 %315, %38, !dbg !30
|
| 374 |
+
%330 = mul nuw nsw i32 %316, %38, !dbg !30
|
| 375 |
+
%331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 8, i32 31), !dbg !25
|
| 376 |
+
%332 = add i32 %329, %331, !dbg !28
|
| 377 |
+
%333 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 8, i32 31), !dbg !25
|
| 378 |
+
%334 = add i32 %330, %333, !dbg !28
|
| 379 |
+
%335 = mul nuw nsw i32 %315, %.lobit1, !dbg !31
|
| 380 |
+
%336 = mul nuw nsw i32 %316, %.lobit1, !dbg !31
|
| 381 |
+
%337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %335, i32 8, i32 31), !dbg !25
|
| 382 |
+
%338 = add i32 %337, %335, !dbg !28
|
| 383 |
+
%339 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 8, i32 31), !dbg !25
|
| 384 |
+
%340 = add i32 %339, %336, !dbg !28
|
| 385 |
+
%341 = icmp slt i32 %320, %326, !dbg !39
|
| 386 |
+
%342 = icmp sge i32 %322, %328, !dbg !39
|
| 387 |
+
%343 = icmp eq i32 %320, %326, !dbg !40
|
| 388 |
+
%344 = icmp ne i32 %322, %328, !dbg !40
|
| 389 |
+
%345 = icmp sgt i32 %332, %338, !dbg !41
|
| 390 |
+
%346 = icmp sle i32 %334, %340, !dbg !41
|
| 391 |
+
%347 = and i1 %343, %345, !dbg !42
|
| 392 |
+
%.not27 = or i1 %344, %346, !dbg !43
|
| 393 |
+
%348 = or i1 %341, %347, !dbg !43
|
| 394 |
+
%.not24 = and i1 %342, %.not27, !dbg !44
|
| 395 |
+
%349 = xor i32 %320, %326, !dbg !33
|
| 396 |
+
%350 = xor i32 %322, %328, !dbg !33
|
| 397 |
+
%351 = select i1 %348, i32 %349, i32 0, !dbg !34
|
| 398 |
+
%352 = select i1 %.not24, i32 %350, i32 0, !dbg !34
|
| 399 |
+
%353 = xor i32 %351, %309, !dbg !35
|
| 400 |
+
%354 = xor i32 %352, %310, !dbg !35
|
| 401 |
+
%355 = xor i32 %338, %332, !dbg !36
|
| 402 |
+
%356 = xor i32 %340, %334, !dbg !36
|
| 403 |
+
%357 = select i1 %348, i32 %355, i32 0, !dbg !37
|
| 404 |
+
%358 = select i1 %.not24, i32 %356, i32 0, !dbg !37
|
| 405 |
+
%359 = xor i32 %357, %315, !dbg !38
|
| 406 |
+
%360 = xor i32 %358, %316, !dbg !38
|
| 407 |
+
%361 = icmp slt i32 %353, %354, !dbg !39
|
| 408 |
+
%362 = icmp eq i32 %353, %354, !dbg !40
|
| 409 |
+
%363 = icmp sgt i32 %359, %360, !dbg !41
|
| 410 |
+
%364 = and i1 %362, %363, !dbg !42
|
| 411 |
+
%365 = or i1 %361, %364, !dbg !43
|
| 412 |
+
%366 = xor i32 %354, %353, !dbg !33
|
| 413 |
+
%367 = select i1 %365, i32 %366, i32 0, !dbg !34
|
| 414 |
+
%368 = xor i32 %367, %353, !dbg !35
|
| 415 |
+
%369 = xor i32 %367, %354, !dbg !35
|
| 416 |
+
%370 = xor i32 %360, %359, !dbg !36
|
| 417 |
+
%371 = select i1 %365, i32 %370, i32 0, !dbg !37
|
| 418 |
+
%372 = xor i32 %371, %359, !dbg !38
|
| 419 |
+
%373 = xor i32 %371, %360, !dbg !38
|
| 420 |
+
%374 = mul nuw nsw i32 %368, %40, !dbg !24
|
| 421 |
+
%375 = mul nuw nsw i32 %369, %40, !dbg !24
|
| 422 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 423 |
+
%376 = insertelement <1 x i32> poison, i32 %374, i64 0, !dbg !25
|
| 424 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %376, i1 true) #4, !dbg !25
|
| 425 |
+
%377 = insertelement <1 x i32> poison, i32 %375, i64 0, !dbg !25
|
| 426 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %377, i1 true) #4, !dbg !25
|
| 427 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 428 |
+
%378 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 429 |
+
%379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 1, i32 31), !dbg !25
|
| 430 |
+
%380 = add i32 %379, %378, !dbg !28
|
| 431 |
+
%381 = insertelement <1 x i32> poison, i32 %380, i64 0, !dbg !25
|
| 432 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %381, i1 %202) #4, !dbg !25
|
| 433 |
+
%382 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 434 |
+
%383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !25
|
| 435 |
+
%384 = add i32 %383, %382, !dbg !28
|
| 436 |
+
%385 = insertelement <1 x i32> poison, i32 %384, i64 0, !dbg !25
|
| 437 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %385, i1 %202) #4, !dbg !25
|
| 438 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 439 |
+
%386 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 440 |
+
%387 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 441 |
+
%388 = mul nuw nsw i32 %368, %.lobit2, !dbg !29
|
| 442 |
+
%389 = mul nuw nsw i32 %369, %.lobit2, !dbg !29
|
| 443 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 444 |
+
%390 = insertelement <1 x i32> poison, i32 %388, i64 0, !dbg !25
|
| 445 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %390, i1 true) #4, !dbg !25
|
| 446 |
+
%391 = insertelement <1 x i32> poison, i32 %389, i64 0, !dbg !25
|
| 447 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %391, i1 true) #4, !dbg !25
|
| 448 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 449 |
+
%392 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 450 |
+
%393 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %392, i32 1, i32 31), !dbg !25
|
| 451 |
+
%394 = add i32 %393, %392, !dbg !28
|
| 452 |
+
%395 = insertelement <1 x i32> poison, i32 %394, i64 0, !dbg !25
|
| 453 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %395, i1 %202) #4, !dbg !25
|
| 454 |
+
%396 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 455 |
+
%397 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %396, i32 1, i32 31), !dbg !25
|
| 456 |
+
%398 = add i32 %397, %396, !dbg !28
|
| 457 |
+
%399 = insertelement <1 x i32> poison, i32 %398, i64 0, !dbg !25
|
| 458 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %399, i1 %202) #4, !dbg !25
|
| 459 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 460 |
+
%400 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 461 |
+
%401 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 462 |
+
%402 = mul nuw nsw i32 %372, %40, !dbg !30
|
| 463 |
+
%403 = mul nuw nsw i32 %373, %40, !dbg !30
|
| 464 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 465 |
+
%404 = insertelement <1 x i32> poison, i32 %402, i64 0, !dbg !25
|
| 466 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %404, i1 true) #4, !dbg !25
|
| 467 |
+
%405 = insertelement <1 x i32> poison, i32 %403, i64 0, !dbg !25
|
| 468 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %405, i1 true) #4, !dbg !25
|
| 469 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 470 |
+
%406 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 471 |
+
%407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 1, i32 31), !dbg !25
|
| 472 |
+
%408 = add i32 %407, %406, !dbg !28
|
| 473 |
+
%409 = insertelement <1 x i32> poison, i32 %408, i64 0, !dbg !25
|
| 474 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %409, i1 %202) #4, !dbg !25
|
| 475 |
+
%410 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 476 |
+
%411 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %410, i32 1, i32 31), !dbg !25
|
| 477 |
+
%412 = add i32 %411, %410, !dbg !28
|
| 478 |
+
%413 = insertelement <1 x i32> poison, i32 %412, i64 0, !dbg !25
|
| 479 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %413, i1 %202) #4, !dbg !25
|
| 480 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 481 |
+
%414 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 482 |
+
%415 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 483 |
+
%416 = mul nuw nsw i32 %372, %.lobit2, !dbg !31
|
| 484 |
+
%417 = mul nuw nsw i32 %373, %.lobit2, !dbg !31
|
| 485 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 486 |
+
%418 = insertelement <1 x i32> poison, i32 %416, i64 0, !dbg !25
|
| 487 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %418, i1 true) #4, !dbg !25
|
| 488 |
+
%419 = insertelement <1 x i32> poison, i32 %417, i64 0, !dbg !25
|
| 489 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %419, i1 true) #4, !dbg !25
|
| 490 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 491 |
+
%420 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25
|
| 492 |
+
%421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 1, i32 31), !dbg !25
|
| 493 |
+
%422 = add i32 %421, %420, !dbg !28
|
| 494 |
+
%423 = insertelement <1 x i32> poison, i32 %422, i64 0, !dbg !25
|
| 495 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %423, i1 %202) #4, !dbg !25
|
| 496 |
+
%424 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25
|
| 497 |
+
%425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !25
|
| 498 |
+
%426 = add i32 %425, %424, !dbg !28
|
| 499 |
+
%427 = insertelement <1 x i32> poison, i32 %426, i64 0, !dbg !25
|
| 500 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %427, i1 %202) #4, !dbg !25
|
| 501 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25
|
| 502 |
+
%428 = load i32, ptr addrspace(3) %190, align 16, !dbg !25
|
| 503 |
+
%429 = load i32, ptr addrspace(3) %193, align 8, !dbg !25
|
| 504 |
+
%430 = icmp slt i32 %386, %400, !dbg !39
|
| 505 |
+
%431 = icmp slt i32 %387, %401, !dbg !39
|
| 506 |
+
%432 = icmp eq i32 %386, %400, !dbg !40
|
| 507 |
+
%433 = icmp eq i32 %387, %401, !dbg !40
|
| 508 |
+
%434 = icmp sgt i32 %414, %428, !dbg !41
|
| 509 |
+
%435 = icmp sgt i32 %415, %429, !dbg !41
|
| 510 |
+
%436 = and i1 %432, %434, !dbg !42
|
| 511 |
+
%437 = and i1 %433, %435, !dbg !42
|
| 512 |
+
%438 = or i1 %430, %436, !dbg !43
|
| 513 |
+
%439 = or i1 %431, %437, !dbg !43
|
| 514 |
+
%440 = xor i32 %400, %386, !dbg !33
|
| 515 |
+
%441 = xor i32 %401, %387, !dbg !33
|
| 516 |
+
%442 = select i1 %438, i32 %440, i32 0, !dbg !34
|
| 517 |
+
%443 = select i1 %439, i32 %441, i32 0, !dbg !34
|
| 518 |
+
%444 = xor i32 %442, %368, !dbg !35
|
| 519 |
+
%445 = xor i32 %443, %369, !dbg !35
|
| 520 |
+
%446 = xor i32 %428, %414, !dbg !36
|
| 521 |
+
%447 = xor i32 %429, %415, !dbg !36
|
| 522 |
+
%448 = select i1 %438, i32 %446, i32 0, !dbg !37
|
| 523 |
+
%449 = select i1 %439, i32 %447, i32 0, !dbg !37
|
| 524 |
+
%450 = xor i32 %448, %372, !dbg !38
|
| 525 |
+
%451 = xor i32 %449, %373, !dbg !38
|
| 526 |
+
%452 = mul nuw nsw i32 %444, %39, !dbg !24
|
| 527 |
+
%453 = mul nuw nsw i32 %445, %39, !dbg !24
|
| 528 |
+
%454 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 16, i32 31), !dbg !25
|
| 529 |
+
%455 = add i32 %452, %454, !dbg !28
|
| 530 |
+
%456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %453, i32 16, i32 31), !dbg !25
|
| 531 |
+
%457 = add i32 %453, %456, !dbg !28
|
| 532 |
+
%458 = mul nuw nsw i32 %444, %.lobit, !dbg !29
|
| 533 |
+
%459 = mul nuw nsw i32 %445, %.lobit, !dbg !29
|
| 534 |
+
%460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 16, i32 31), !dbg !25
|
| 535 |
+
%461 = add i32 %458, %460, !dbg !28
|
| 536 |
+
%462 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %459, i32 16, i32 31), !dbg !25
|
| 537 |
+
%463 = add i32 %459, %462, !dbg !28
|
| 538 |
+
%464 = mul nuw nsw i32 %450, %39, !dbg !30
|
| 539 |
+
%465 = mul nuw nsw i32 %451, %39, !dbg !30
|
| 540 |
+
%466 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 16, i32 31), !dbg !25
|
| 541 |
+
%467 = add i32 %464, %466, !dbg !28
|
| 542 |
+
%468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %465, i32 16, i32 31), !dbg !25
|
| 543 |
+
%469 = add i32 %465, %468, !dbg !28
|
| 544 |
+
%470 = mul nuw nsw i32 %450, %.lobit, !dbg !31
|
| 545 |
+
%471 = mul nuw nsw i32 %451, %.lobit, !dbg !31
|
| 546 |
+
%472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 16, i32 31), !dbg !25
|
| 547 |
+
%473 = add i32 %472, %470, !dbg !28
|
| 548 |
+
%474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 16, i32 31), !dbg !25
|
| 549 |
+
%475 = add i32 %474, %471, !dbg !28
|
| 550 |
+
%476 = icmp slt i32 %455, %461, !dbg !39
|
| 551 |
+
%477 = icmp slt i32 %457, %463, !dbg !39
|
| 552 |
+
%478 = icmp eq i32 %455, %461, !dbg !40
|
| 553 |
+
%479 = icmp eq i32 %457, %463, !dbg !40
|
| 554 |
+
%480 = icmp sgt i32 %467, %473, !dbg !41
|
| 555 |
+
%481 = icmp sgt i32 %469, %475, !dbg !41
|
| 556 |
+
%482 = and i1 %478, %480, !dbg !42
|
| 557 |
+
%483 = and i1 %479, %481, !dbg !42
|
| 558 |
+
%484 = or i1 %476, %482, !dbg !43
|
| 559 |
+
%485 = or i1 %477, %483, !dbg !43
|
| 560 |
+
%486 = xor i32 %455, %461, !dbg !33
|
| 561 |
+
%487 = xor i32 %457, %463, !dbg !33
|
| 562 |
+
%488 = select i1 %484, i32 %486, i32 0, !dbg !34
|
| 563 |
+
%489 = select i1 %485, i32 %487, i32 0, !dbg !34
|
| 564 |
+
%490 = xor i32 %488, %444, !dbg !35
|
| 565 |
+
%491 = xor i32 %489, %445, !dbg !35
|
| 566 |
+
%492 = xor i32 %473, %467, !dbg !36
|
| 567 |
+
%493 = xor i32 %475, %469, !dbg !36
|
| 568 |
+
%494 = select i1 %484, i32 %492, i32 0, !dbg !37
|
| 569 |
+
%495 = select i1 %485, i32 %493, i32 0, !dbg !37
|
| 570 |
+
%496 = xor i32 %494, %450, !dbg !38
|
| 571 |
+
%497 = xor i32 %495, %451, !dbg !38
|
| 572 |
+
%498 = mul nuw nsw i32 %490, %38, !dbg !24
|
| 573 |
+
%499 = mul nuw nsw i32 %491, %38, !dbg !24
|
| 574 |
+
%500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 8, i32 31), !dbg !25
|
| 575 |
+
%501 = add i32 %498, %500, !dbg !28
|
| 576 |
+
%502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 8, i32 31), !dbg !25
|
| 577 |
+
%503 = add i32 %499, %502, !dbg !28
|
| 578 |
+
%504 = mul nuw nsw i32 %490, %.lobit1, !dbg !29
|
| 579 |
+
%505 = mul nuw nsw i32 %491, %.lobit1, !dbg !29
|
| 580 |
+
%506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 8, i32 31), !dbg !25
|
| 581 |
+
%507 = add i32 %504, %506, !dbg !28
|
| 582 |
+
%508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %505, i32 8, i32 31), !dbg !25
|
| 583 |
+
%509 = add i32 %505, %508, !dbg !28
|
| 584 |
+
%510 = mul nuw nsw i32 %496, %38, !dbg !30
|
| 585 |
+
%511 = mul nuw nsw i32 %497, %38, !dbg !30
|
| 586 |
+
%512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %510, i32 8, i32 31), !dbg !25
|
| 587 |
+
%513 = add i32 %510, %512, !dbg !28
|
| 588 |
+
%514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 8, i32 31), !dbg !25
|
| 589 |
+
%515 = add i32 %511, %514, !dbg !28
|
| 590 |
+
%516 = mul nuw nsw i32 %496, %.lobit1, !dbg !31
|
| 591 |
+
%517 = mul nuw nsw i32 %497, %.lobit1, !dbg !31
|
| 592 |
+
%518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 8, i32 31), !dbg !25
|
| 593 |
+
%519 = add i32 %518, %516, !dbg !28
|
| 594 |
+
%520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 8, i32 31), !dbg !25
|
| 595 |
+
%521 = add i32 %520, %517, !dbg !28
|
| 596 |
+
%522 = icmp slt i32 %501, %507, !dbg !39
|
| 597 |
+
%523 = icmp slt i32 %503, %509, !dbg !39
|
| 598 |
+
%524 = icmp eq i32 %501, %507, !dbg !40
|
| 599 |
+
%525 = icmp eq i32 %503, %509, !dbg !40
|
| 600 |
+
%526 = icmp sgt i32 %513, %519, !dbg !41
|
| 601 |
+
%527 = icmp sgt i32 %515, %521, !dbg !41
|
| 602 |
+
%528 = and i1 %524, %526, !dbg !42
|
| 603 |
+
%529 = and i1 %525, %527, !dbg !42
|
| 604 |
+
%530 = or i1 %522, %528, !dbg !43
|
| 605 |
+
%531 = or i1 %523, %529, !dbg !43
|
| 606 |
+
%532 = xor i32 %519, %513, !dbg !36
|
| 607 |
+
%533 = xor i32 %521, %515, !dbg !36
|
| 608 |
+
%534 = select i1 %530, i32 %532, i32 0, !dbg !37
|
| 609 |
+
%535 = select i1 %531, i32 %533, i32 0, !dbg !37
|
| 610 |
+
%536 = xor i32 %534, %496, !dbg !38
|
| 611 |
+
%537 = xor i32 %535, %497, !dbg !38
|
| 612 |
+
%narrow = select i1 %18, i32 %33, i32 0, !dbg !45
|
| 613 |
+
%538 = sext i32 %narrow to i64, !dbg !45
|
| 614 |
+
%narrow28 = select i1 %18, i32 %34, i32 0, !dbg !45
|
| 615 |
+
%539 = sext i32 %narrow28 to i64, !dbg !45
|
| 616 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
|
| 617 |
+
%540 = add nsw i64 %539, %538, !dbg !48
|
| 618 |
+
%extelt.offset = lshr i64 %540, 32, !dbg !46
|
| 619 |
+
%541 = trunc nuw i64 %extelt.offset to i32, !dbg !46
|
| 620 |
+
%542 = trunc i64 %540 to i32, !dbg !46
|
| 621 |
+
%543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 16, i32 31), !dbg !46
|
| 622 |
+
%544 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 16, i32 31), !dbg !46
|
| 623 |
+
%545 = insertelement <2 x i32> poison, i32 %543, i64 0, !dbg !46
|
| 624 |
+
%546 = insertelement <2 x i32> %545, i32 %544, i64 1, !dbg !46
|
| 625 |
+
%547 = bitcast <2 x i32> %546 to i64, !dbg !46
|
| 626 |
+
%548 = add i64 %540, %547, !dbg !48
|
| 627 |
+
%extelt.offset29 = lshr i64 %548, 32, !dbg !46
|
| 628 |
+
%549 = trunc nuw i64 %extelt.offset29 to i32, !dbg !46
|
| 629 |
+
%550 = trunc i64 %548 to i32, !dbg !46
|
| 630 |
+
%551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 8, i32 31), !dbg !46
|
| 631 |
+
%552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 8, i32 31), !dbg !46
|
| 632 |
+
%553 = insertelement <2 x i32> poison, i32 %551, i64 0, !dbg !46
|
| 633 |
+
%554 = insertelement <2 x i32> %553, i32 %552, i64 1, !dbg !46
|
| 634 |
+
%555 = bitcast <2 x i32> %554 to i64, !dbg !46
|
| 635 |
+
%556 = add i64 %548, %555, !dbg !48
|
| 636 |
+
%557 = icmp eq i32 %11, 0, !dbg !46
|
| 637 |
+
%558 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %21, !dbg !46
|
| 638 |
+
%559 = getelementptr i64, ptr addrspace(3) %558, i32 %186, !dbg !46
|
| 639 |
+
%560 = insertelement <1 x i64> poison, i64 %556, i64 0, !dbg !46
|
| 640 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %559, <1 x i64> %560, i1 %557) #4, !dbg !46
|
| 641 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
|
| 642 |
+
%561 = icmp samesign ult i32 %10, 16, !dbg !46
|
| 643 |
+
%562 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %10, !dbg !46
|
| 644 |
+
%563 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %562, i1 %561) #4, !dbg !46
|
| 645 |
+
%extelt.offset30 = lshr i64 %563, 32, !dbg !46
|
| 646 |
+
%564 = trunc nuw i64 %extelt.offset30 to i32, !dbg !46
|
| 647 |
+
%565 = trunc i64 %563 to i32, !dbg !46
|
| 648 |
+
%566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 1, i32 31), !dbg !46
|
| 649 |
+
%567 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %564, i32 1, i32 31), !dbg !46
|
| 650 |
+
%568 = insertelement <2 x i32> poison, i32 %566, i64 0, !dbg !46
|
| 651 |
+
%569 = insertelement <2 x i32> %568, i32 %567, i64 1, !dbg !46
|
| 652 |
+
%570 = bitcast <2 x i32> %569 to i64, !dbg !46
|
| 653 |
+
%571 = add i64 %563, %570, !dbg !48
|
| 654 |
+
%572 = and i32 %10, 1009, !dbg !46
|
| 655 |
+
%573 = icmp eq i32 %572, 0, !dbg !46
|
| 656 |
+
%574 = insertelement <1 x i64> poison, i64 %571, i64 0, !dbg !46
|
| 657 |
+
tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %562, <1 x i64> %574, i1 %573) #4, !dbg !46
|
| 658 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
|
| 659 |
+
%575 = load i64, ptr addrspace(3) %558, align 16, !dbg !46
|
| 660 |
+
%576 = trunc i64 %575 to i32, !dbg !49
|
| 661 |
+
%577 = shl i32 %17, 4, !dbg !50
|
| 662 |
+
%578 = or disjoint i32 %577, %21, !dbg !51
|
| 663 |
+
%579 = sext i32 %578 to i64, !dbg !52
|
| 664 |
+
%580 = getelementptr i32, ptr addrspace(1) %1, i64 %579, !dbg !52
|
| 665 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
|
| 666 |
+
%581 = and i32 %10, 3, !dbg !53
|
| 667 |
+
%582 = shl nuw nsw i32 %581, 3, !dbg !53
|
| 668 |
+
%583 = and i32 %187, 96, !dbg !53
|
| 669 |
+
%584 = and i32 %10, 4, !dbg !53
|
| 670 |
+
%585 = icmp eq i32 %584, 0, !dbg !53
|
| 671 |
+
%586 = select i1 %585, i32 0, i32 192, !dbg !53
|
| 672 |
+
%587 = select i1 %.not, i32 0, i32 260, !dbg !53
|
| 673 |
+
%588 = or disjoint i32 %582, %583, !dbg !53
|
| 674 |
+
%589 = xor i32 %588, %586, !dbg !53
|
| 675 |
+
%590 = or disjoint i32 %589, %587, !dbg !53
|
| 676 |
+
%591 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %590, !dbg !53
|
| 677 |
+
%592 = insertelement <1 x i32> poison, i32 %536, i64 0, !dbg !53
|
| 678 |
+
store <1 x i32> %592, ptr addrspace(3) %591, align 4, !dbg !53
|
| 679 |
+
%593 = xor i32 %590, 4, !dbg !53
|
| 680 |
+
%594 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %593, !dbg !53
|
| 681 |
+
%595 = insertelement <1 x i32> poison, i32 %537, i64 0, !dbg !53
|
| 682 |
+
store <1 x i32> %595, ptr addrspace(3) %594, align 4, !dbg !53
|
| 683 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
|
| 684 |
+
%596 = shl nuw nsw i32 %581, 5, !dbg !53
|
| 685 |
+
%597 = and i32 %10, 28, !dbg !53
|
| 686 |
+
%598 = select i1 %.not3, i32 0, i32 192, !dbg !53
|
| 687 |
+
%599 = or disjoint i32 %596, %597, !dbg !53
|
| 688 |
+
%600 = xor i32 %599, %598, !dbg !53
|
| 689 |
+
%601 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %600, !dbg !53
|
| 690 |
+
%602 = load i32, ptr addrspace(3) %601, align 4, !dbg !53
|
| 691 |
+
%603 = xor i32 %600, 260, !dbg !53
|
| 692 |
+
%604 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %603, !dbg !53
|
| 693 |
+
%605 = load i32, ptr addrspace(3) %604, align 4, !dbg !53
|
| 694 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %602, i32 %605, ptr addrspace(1) %580, i1 %19) #4, !dbg !53
|
| 695 |
+
%606 = sext i32 %16 to i64, !dbg !54
|
| 696 |
+
%607 = getelementptr i32, ptr addrspace(1) %2, i64 %606, !dbg !54
|
| 697 |
+
%608 = and i32 %10, 56, !dbg !55
|
| 698 |
+
%609 = icmp eq i32 %608, 0, !dbg !55
|
| 699 |
+
%610 = and i1 %609, %18, !dbg !55
|
| 700 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %576, ptr addrspace(1) %607, i1 %610) #4, !dbg !55
|
| 701 |
+
ret void, !dbg !56
|
| 702 |
+
}
|
| 703 |
+
|
| 704 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 705 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 706 |
+
|
| 707 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 708 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 709 |
+
|
| 710 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 711 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
| 712 |
+
|
| 713 |
+
; Function Attrs: convergent nocallback nounwind
|
| 714 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
|
| 715 |
+
|
| 716 |
+
attributes #0 = { nounwind "nvvm.reqntid"="64" }
|
| 717 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 718 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 719 |
+
attributes #3 = { convergent nocallback nounwind }
|
| 720 |
+
attributes #4 = { nounwind }
|
| 721 |
+
|
| 722 |
+
!llvm.dbg.cu = !{!0}
|
| 723 |
+
!llvm.module.flags = !{!2, !3}
|
| 724 |
+
|
| 725 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 726 |
+
!1 = !DIFile(filename: "chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx")
|
| 727 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 728 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 729 |
+
!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 730 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 731 |
+
!6 = !{}
|
| 732 |
+
!7 = !DILocation(line: 24, column: 28, scope: !4)
|
| 733 |
+
!8 = !DILocation(line: 24, column: 33, scope: !4)
|
| 734 |
+
!9 = !DILocation(line: 25, column: 44, scope: !4)
|
| 735 |
+
!10 = !DILocation(line: 25, column: 23, scope: !4)
|
| 736 |
+
!11 = !DILocation(line: 26, column: 21, scope: !4)
|
| 737 |
+
!12 = !DILocation(line: 27, column: 38, scope: !4)
|
| 738 |
+
!13 = !DILocation(line: 34, column: 19, scope: !4)
|
| 739 |
+
!14 = !DILocation(line: 36, column: 38, scope: !4)
|
| 740 |
+
!15 = !DILocation(line: 36, column: 35, scope: !4)
|
| 741 |
+
!16 = !DILocation(line: 36, column: 45, scope: !4)
|
| 742 |
+
!17 = !DILocation(line: 36, column: 30, scope: !4)
|
| 743 |
+
!18 = !DILocation(line: 36, column: 54, scope: !4)
|
| 744 |
+
!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22)
|
| 745 |
+
!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0)
|
| 746 |
+
!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
|
| 747 |
+
!22 = !DILocation(line: 41, column: 67, scope: !4)
|
| 748 |
+
!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22)
|
| 749 |
+
!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22)
|
| 750 |
+
!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22)
|
| 751 |
+
!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0)
|
| 752 |
+
!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
|
| 753 |
+
!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22)
|
| 754 |
+
!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22)
|
| 755 |
+
!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22)
|
| 756 |
+
!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22)
|
| 757 |
+
!32 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22)
|
| 758 |
+
!33 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22)
|
| 759 |
+
!34 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22)
|
| 760 |
+
!35 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22)
|
| 761 |
+
!36 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22)
|
| 762 |
+
!37 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22)
|
| 763 |
+
!38 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22)
|
| 764 |
+
!39 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22)
|
| 765 |
+
!40 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22)
|
| 766 |
+
!41 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22)
|
| 767 |
+
!42 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22)
|
| 768 |
+
!43 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22)
|
| 769 |
+
!44 = !DILocation(line: 599, column: 19, scope: !20, inlinedAt: !22)
|
| 770 |
+
!45 = !DILocation(line: 44, column: 34, scope: !4)
|
| 771 |
+
!46 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !47)
|
| 772 |
+
!47 = !DILocation(line: 45, column: 26, scope: !4)
|
| 773 |
+
!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !47)
|
| 774 |
+
!49 = !DILocation(line: 48, column: 21, scope: !4)
|
| 775 |
+
!50 = !DILocation(line: 49, column: 35, scope: !4)
|
| 776 |
+
!51 = !DILocation(line: 49, column: 32, scope: !4)
|
| 777 |
+
!52 = !DILocation(line: 49, column: 25, scope: !4)
|
| 778 |
+
!53 = !DILocation(line: 49, column: 47, scope: !4)
|
| 779 |
+
!54 = !DILocation(line: 50, column: 25, scope: !4)
|
| 780 |
+
!55 = !DILocation(line: 50, column: 37, scope: !4)
|
| 781 |
+
!56 = !DILocation(line: 50, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx
ADDED
|
@@ -0,0 +1,1410 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
// @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3
|
| 12 |
+
.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1,
|
| 15 |
+
.param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2,
|
| 16 |
+
.param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3,
|
| 17 |
+
.param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4,
|
| 18 |
+
.param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5,
|
| 19 |
+
.param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6
|
| 20 |
+
)
|
| 21 |
+
.reqntid 64
|
| 22 |
+
{
|
| 23 |
+
.reg .pred %p<155>;
|
| 24 |
+
.reg .b32 %r<504>;
|
| 25 |
+
.reg .b64 %rd<27>;
|
| 26 |
+
.loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0
|
| 27 |
+
$L__func_begin0:
|
| 28 |
+
.loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0
|
| 29 |
+
|
| 30 |
+
// %bb.0:
|
| 31 |
+
ld.param.b64 %rd8, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0];
|
| 32 |
+
ld.param.b64 %rd9, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1];
|
| 33 |
+
$L__tmp0:
|
| 34 |
+
.loc 1 24 28 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:24:28
|
| 35 |
+
mov.u32 %r105, %ctaid.x;
|
| 36 |
+
.loc 1 24 33 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:24:33
|
| 37 |
+
shl.b32 %r106, %r105, 3;
|
| 38 |
+
ld.param.b64 %rd10, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2];
|
| 39 |
+
.loc 1 25 44 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:25:44
|
| 40 |
+
mov.u32 %r107, %tid.x;
|
| 41 |
+
and.b32 %r108, %r107, 24;
|
| 42 |
+
and.b32 %r109, %r107, 7;
|
| 43 |
+
bfe.u32 %r110, %r107, 3, 3;
|
| 44 |
+
.loc 1 25 23 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:25:23
|
| 45 |
+
or.b32 %r111, %r106, %r109;
|
| 46 |
+
or.b32 %r112, %r110, %r106;
|
| 47 |
+
.loc 1 26 21 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:26:21
|
| 48 |
+
setp.lt.s32 %p1, %r111, 32;
|
| 49 |
+
setp.lt.s32 %p54, %r112, 32;
|
| 50 |
+
.loc 1 27 38 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:27:38
|
| 51 |
+
or.b32 %r113, %r110, 8;
|
| 52 |
+
shl.b32 %r114, %r109, 1;
|
| 53 |
+
.loc 1 34 19 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:34:19
|
| 54 |
+
bfe.s32 %r115, %r105, 28, 1;
|
| 55 |
+
shr.u32 %r116, %r115, 28;
|
| 56 |
+
add.s32 %r117, %r111, %r116;
|
| 57 |
+
.loc 1 36 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:35
|
| 58 |
+
shl.b32 %r118, %r117, 4;
|
| 59 |
+
and.b32 %r119, %r118, -256;
|
| 60 |
+
add.s32 %r120, %r119, %r111;
|
| 61 |
+
.loc 1 36 45 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:45
|
| 62 |
+
mad.lo.s32 %r121, %r110, 17, %r120;
|
| 63 |
+
add.s32 %r122, %r121, 136;
|
| 64 |
+
.loc 1 36 30 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:30
|
| 65 |
+
mad.wide.s32 %rd1, %r121, 4, %rd8;
|
| 66 |
+
mad.wide.s32 %rd2, %r122, 4, %rd8;
|
| 67 |
+
.loc 1 36 54 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:54
|
| 68 |
+
// begin inline asm
|
| 69 |
+
mov.u32 %r1, 0x0;
|
| 70 |
+
@%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
|
| 71 |
+
// end inline asm
|
| 72 |
+
// begin inline asm
|
| 73 |
+
mov.u32 %r2, 0x0;
|
| 74 |
+
@%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ];
|
| 75 |
+
// end inline asm
|
| 76 |
+
$L__tmp1:
|
| 77 |
+
.loc 2 627 44 // triton_helpers.py:627:44 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 78 |
+
shr.u32 %r123, %r107, 4;
|
| 79 |
+
bfe.u32 %r124, %r107, 4, 1;
|
| 80 |
+
bfe.s32 %r125, %r107, 3, 1;
|
| 81 |
+
and.b32 %r126, %r107, 8;
|
| 82 |
+
bfe.u32 %r127, %r107, 3, 1;
|
| 83 |
+
bfe.s32 %r128, %r107, 5, 1;
|
| 84 |
+
bfe.u32 %r129, %r107, 5, 1;
|
| 85 |
+
.loc 2 537 21 // triton_helpers.py:537:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 86 |
+
xor.b32 %r130, %r127, 1;
|
| 87 |
+
xor.b32 %r131, %r124, 1;
|
| 88 |
+
xor.b32 %r132, %r129, 1;
|
| 89 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 90 |
+
mul.lo.s32 %r133, %r1, %r130;
|
| 91 |
+
mul.lo.s32 %r134, %r2, %r130;
|
| 92 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 93 |
+
shfl.sync.bfly.b32 %r135, %r133, 8, 31, -1;
|
| 94 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 95 |
+
add.s32 %r136, %r135, %r133;
|
| 96 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 97 |
+
shfl.sync.bfly.b32 %r137, %r134, 8, 31, -1;
|
| 98 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 99 |
+
add.s32 %r138, %r137, %r134;
|
| 100 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 101 |
+
mul.lo.s32 %r139, %r1, %r127;
|
| 102 |
+
mul.lo.s32 %r140, %r2, %r127;
|
| 103 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 104 |
+
shfl.sync.bfly.b32 %r141, %r139, 8, 31, -1;
|
| 105 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 106 |
+
add.s32 %r142, %r141, %r139;
|
| 107 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 108 |
+
shfl.sync.bfly.b32 %r143, %r140, 8, 31, -1;
|
| 109 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 110 |
+
add.s32 %r144, %r143, %r140;
|
| 111 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 112 |
+
mul.lo.s32 %r145, %r130, %r110;
|
| 113 |
+
shl.b32 %r146, %r130, 3;
|
| 114 |
+
or.b32 %r147, %r145, %r146;
|
| 115 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 116 |
+
shfl.sync.bfly.b32 %r148, %r145, 8, 31, -1;
|
| 117 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 118 |
+
add.s32 %r149, %r148, %r145;
|
| 119 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 120 |
+
shfl.sync.bfly.b32 %r150, %r147, 8, 31, -1;
|
| 121 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 122 |
+
add.s32 %r151, %r150, %r147;
|
| 123 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 124 |
+
mul.lo.s32 %r152, %r110, %r127;
|
| 125 |
+
or.b32 %r153, %r152, %r126;
|
| 126 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 127 |
+
shfl.sync.bfly.b32 %r154, %r152, 8, 31, -1;
|
| 128 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 129 |
+
add.s32 %r155, %r154, %r152;
|
| 130 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 131 |
+
shfl.sync.bfly.b32 %r156, %r153, 8, 31, -1;
|
| 132 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 133 |
+
add.s32 %r157, %r156, %r153;
|
| 134 |
+
.loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 135 |
+
and.b32 %r158, %r123, 1;
|
| 136 |
+
setp.ne.b32 %p56, %r158, 0;
|
| 137 |
+
setp.ge.s32 %p57, %r136, %r142;
|
| 138 |
+
setp.ne.b32 %p58, %r136, %r142;
|
| 139 |
+
setp.le.s32 %p59, %r149, %r155;
|
| 140 |
+
or.pred %p60, %p58, %p59;
|
| 141 |
+
and.pred %p61, %p57, %p60;
|
| 142 |
+
xor.pred %p62, %p61, %p56;
|
| 143 |
+
setp.ge.s32 %p63, %r138, %r144;
|
| 144 |
+
setp.ne.b32 %p64, %r138, %r144;
|
| 145 |
+
setp.le.s32 %p65, %r151, %r157;
|
| 146 |
+
or.pred %p66, %p64, %p65;
|
| 147 |
+
and.pred %p67, %p63, %p66;
|
| 148 |
+
xor.pred %p68, %p67, %p56;
|
| 149 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 150 |
+
xor.b32 %r159, %r142, %r136;
|
| 151 |
+
xor.b32 %r160, %r144, %r138;
|
| 152 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 153 |
+
selp.b32 %r161, 0, %r159, %p62;
|
| 154 |
+
selp.b32 %r162, 0, %r160, %p68;
|
| 155 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 156 |
+
xor.b32 %r163, %r161, %r1;
|
| 157 |
+
xor.b32 %r164, %r162, %r2;
|
| 158 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 159 |
+
xor.b32 %r165, %r155, %r149;
|
| 160 |
+
xor.b32 %r166, %r157, %r151;
|
| 161 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 162 |
+
selp.b32 %r167, 0, %r165, %p62;
|
| 163 |
+
selp.b32 %r168, 0, %r166, %p68;
|
| 164 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 165 |
+
xor.b32 %r169, %r167, %r110;
|
| 166 |
+
xor.b32 %r170, %r168, %r113;
|
| 167 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 168 |
+
mul.lo.s32 %r171, %r163, %r131;
|
| 169 |
+
mul.lo.s32 %r172, %r164, %r131;
|
| 170 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 171 |
+
shfl.sync.bfly.b32 %r173, %r171, 16, 31, -1;
|
| 172 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 173 |
+
add.s32 %r174, %r171, %r173;
|
| 174 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 175 |
+
shfl.sync.bfly.b32 %r175, %r172, 16, 31, -1;
|
| 176 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 177 |
+
add.s32 %r176, %r172, %r175;
|
| 178 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 179 |
+
mul.lo.s32 %r177, %r163, %r124;
|
| 180 |
+
mul.lo.s32 %r178, %r164, %r124;
|
| 181 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 182 |
+
shfl.sync.bfly.b32 %r179, %r177, 16, 31, -1;
|
| 183 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 184 |
+
add.s32 %r180, %r177, %r179;
|
| 185 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 186 |
+
shfl.sync.bfly.b32 %r181, %r178, 16, 31, -1;
|
| 187 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 188 |
+
add.s32 %r182, %r178, %r181;
|
| 189 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 190 |
+
mul.lo.s32 %r183, %r169, %r131;
|
| 191 |
+
mul.lo.s32 %r184, %r170, %r131;
|
| 192 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 193 |
+
shfl.sync.bfly.b32 %r185, %r183, 16, 31, -1;
|
| 194 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 195 |
+
add.s32 %r186, %r183, %r185;
|
| 196 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 197 |
+
shfl.sync.bfly.b32 %r187, %r184, 16, 31, -1;
|
| 198 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 199 |
+
add.s32 %r188, %r184, %r187;
|
| 200 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 201 |
+
mul.lo.s32 %r189, %r169, %r124;
|
| 202 |
+
mul.lo.s32 %r190, %r170, %r124;
|
| 203 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 204 |
+
shfl.sync.bfly.b32 %r191, %r189, 16, 31, -1;
|
| 205 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 206 |
+
add.s32 %r192, %r189, %r191;
|
| 207 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 208 |
+
shfl.sync.bfly.b32 %r193, %r190, 16, 31, -1;
|
| 209 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 210 |
+
add.s32 %r194, %r190, %r193;
|
| 211 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 212 |
+
setp.lt.s32 %p69, %r174, %r180;
|
| 213 |
+
setp.lt.s32 %p70, %r176, %r182;
|
| 214 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 215 |
+
setp.eq.b32 %p71, %r174, %r180;
|
| 216 |
+
setp.eq.b32 %p72, %r176, %r182;
|
| 217 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 218 |
+
setp.gt.s32 %p73, %r186, %r192;
|
| 219 |
+
setp.gt.s32 %p74, %r188, %r194;
|
| 220 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 221 |
+
and.pred %p75, %p71, %p73;
|
| 222 |
+
and.pred %p76, %p72, %p74;
|
| 223 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 224 |
+
or.pred %p77, %p69, %p75;
|
| 225 |
+
or.pred %p78, %p70, %p76;
|
| 226 |
+
.loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 227 |
+
selp.b32 %r195, 1, 0, %p77;
|
| 228 |
+
selp.b32 %r196, 1, 0, %p78;
|
| 229 |
+
.loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 230 |
+
setp.eq.b32 %p79, %r129, %r195;
|
| 231 |
+
setp.eq.b32 %p80, %r129, %r196;
|
| 232 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 233 |
+
xor.b32 %r197, %r174, %r180;
|
| 234 |
+
xor.b32 %r198, %r176, %r182;
|
| 235 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 236 |
+
selp.b32 %r199, 0, %r197, %p79;
|
| 237 |
+
selp.b32 %r200, 0, %r198, %p80;
|
| 238 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 239 |
+
xor.b32 %r201, %r199, %r163;
|
| 240 |
+
xor.b32 %r202, %r200, %r164;
|
| 241 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 242 |
+
xor.b32 %r203, %r186, %r192;
|
| 243 |
+
xor.b32 %r204, %r188, %r194;
|
| 244 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 245 |
+
selp.b32 %r205, 0, %r203, %p79;
|
| 246 |
+
selp.b32 %r206, 0, %r204, %p80;
|
| 247 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 248 |
+
xor.b32 %r207, %r205, %r169;
|
| 249 |
+
xor.b32 %r208, %r206, %r170;
|
| 250 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 251 |
+
mul.lo.s32 %r209, %r201, %r130;
|
| 252 |
+
mul.lo.s32 %r210, %r202, %r130;
|
| 253 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 254 |
+
shfl.sync.bfly.b32 %r211, %r209, 8, 31, -1;
|
| 255 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 256 |
+
add.s32 %r212, %r209, %r211;
|
| 257 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 258 |
+
shfl.sync.bfly.b32 %r213, %r210, 8, 31, -1;
|
| 259 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 260 |
+
add.s32 %r214, %r210, %r213;
|
| 261 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 262 |
+
mul.lo.s32 %r215, %r201, %r127;
|
| 263 |
+
mul.lo.s32 %r216, %r202, %r127;
|
| 264 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 265 |
+
shfl.sync.bfly.b32 %r217, %r215, 8, 31, -1;
|
| 266 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 267 |
+
add.s32 %r218, %r215, %r217;
|
| 268 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 269 |
+
shfl.sync.bfly.b32 %r219, %r216, 8, 31, -1;
|
| 270 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 271 |
+
add.s32 %r220, %r216, %r219;
|
| 272 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 273 |
+
mul.lo.s32 %r221, %r207, %r130;
|
| 274 |
+
mul.lo.s32 %r222, %r208, %r130;
|
| 275 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 276 |
+
shfl.sync.bfly.b32 %r223, %r221, 8, 31, -1;
|
| 277 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 278 |
+
add.s32 %r224, %r221, %r223;
|
| 279 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 280 |
+
shfl.sync.bfly.b32 %r225, %r222, 8, 31, -1;
|
| 281 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 282 |
+
add.s32 %r226, %r222, %r225;
|
| 283 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 284 |
+
mul.lo.s32 %r227, %r207, %r127;
|
| 285 |
+
mul.lo.s32 %r228, %r208, %r127;
|
| 286 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 287 |
+
shfl.sync.bfly.b32 %r229, %r227, 8, 31, -1;
|
| 288 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 289 |
+
add.s32 %r230, %r227, %r229;
|
| 290 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 291 |
+
shfl.sync.bfly.b32 %r231, %r228, 8, 31, -1;
|
| 292 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 293 |
+
add.s32 %r232, %r228, %r231;
|
| 294 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 295 |
+
setp.lt.s32 %p81, %r212, %r218;
|
| 296 |
+
setp.lt.s32 %p82, %r214, %r220;
|
| 297 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 298 |
+
setp.eq.b32 %p83, %r212, %r218;
|
| 299 |
+
setp.eq.b32 %p84, %r214, %r220;
|
| 300 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 301 |
+
setp.gt.s32 %p85, %r224, %r230;
|
| 302 |
+
setp.gt.s32 %p86, %r226, %r232;
|
| 303 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 304 |
+
and.pred %p87, %p83, %p85;
|
| 305 |
+
and.pred %p88, %p84, %p86;
|
| 306 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 307 |
+
or.pred %p89, %p81, %p87;
|
| 308 |
+
or.pred %p90, %p82, %p88;
|
| 309 |
+
.loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 310 |
+
selp.b32 %r233, 1, 0, %p89;
|
| 311 |
+
selp.b32 %r234, 1, 0, %p90;
|
| 312 |
+
.loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 313 |
+
setp.eq.b32 %p91, %r129, %r233;
|
| 314 |
+
setp.eq.b32 %p92, %r129, %r234;
|
| 315 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 316 |
+
xor.b32 %r235, %r212, %r218;
|
| 317 |
+
xor.b32 %r236, %r214, %r220;
|
| 318 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 319 |
+
selp.b32 %r237, 0, %r235, %p91;
|
| 320 |
+
selp.b32 %r238, 0, %r236, %p92;
|
| 321 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 322 |
+
xor.b32 %r239, %r237, %r201;
|
| 323 |
+
xor.b32 %r240, %r238, %r202;
|
| 324 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 325 |
+
xor.b32 %r241, %r224, %r230;
|
| 326 |
+
xor.b32 %r242, %r226, %r232;
|
| 327 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 328 |
+
selp.b32 %r243, 0, %r241, %p91;
|
| 329 |
+
selp.b32 %r244, 0, %r242, %p92;
|
| 330 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 331 |
+
xor.b32 %r245, %r243, %r207;
|
| 332 |
+
xor.b32 %r246, %r244, %r208;
|
| 333 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 334 |
+
mul.lo.s32 %r4, %r239, %r132;
|
| 335 |
+
mul.lo.s32 %r6, %r240, %r132;
|
| 336 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 337 |
+
shl.b32 %r247, %r107, 1;
|
| 338 |
+
and.b32 %r248, %r247, 48;
|
| 339 |
+
or.b32 %r249, %r248, %r114;
|
| 340 |
+
shl.b32 %r250, %r249, 3;
|
| 341 |
+
mov.b32 %r251, global_smem;
|
| 342 |
+
add.s32 %r252, %r251, %r250;
|
| 343 |
+
shl.b32 %r253, %r129, 2;
|
| 344 |
+
add.s32 %r3, %r252, %r253;
|
| 345 |
+
mov.pred %p3, -1;
|
| 346 |
+
// begin inline asm
|
| 347 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r4;
|
| 348 |
+
// end inline asm
|
| 349 |
+
add.s32 %r5, %r3, 8;
|
| 350 |
+
// begin inline asm
|
| 351 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r6;
|
| 352 |
+
// end inline asm
|
| 353 |
+
bar.sync 0;
|
| 354 |
+
setp.lt.u32 %p5, %r107, 128;
|
| 355 |
+
shl.b32 %r254, %r107, 2;
|
| 356 |
+
add.s32 %r8, %r251, %r254;
|
| 357 |
+
// begin inline asm
|
| 358 |
+
@%p5 ld.shared.b32 %r7, [ %r8 + 0 ];
|
| 359 |
+
// end inline asm
|
| 360 |
+
shfl.sync.bfly.b32 %r255, %r7, 1, 31, -1;
|
| 361 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 362 |
+
add.s32 %r10, %r255, %r7;
|
| 363 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 364 |
+
and.b32 %r256, %r107, 897;
|
| 365 |
+
setp.eq.b32 %p6, %r256, 0;
|
| 366 |
+
// begin inline asm
|
| 367 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r10;
|
| 368 |
+
// end inline asm
|
| 369 |
+
add.s32 %r12, %r8, 256;
|
| 370 |
+
// begin inline asm
|
| 371 |
+
@%p5 ld.shared.b32 %r11, [ %r12 + 0 ];
|
| 372 |
+
// end inline asm
|
| 373 |
+
shfl.sync.bfly.b32 %r257, %r11, 1, 31, -1;
|
| 374 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 375 |
+
add.s32 %r14, %r257, %r11;
|
| 376 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 377 |
+
// begin inline asm
|
| 378 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r14;
|
| 379 |
+
// end inline asm
|
| 380 |
+
bar.sync 0;
|
| 381 |
+
ld.shared.b32 %r258, [%r252];
|
| 382 |
+
ld.shared.b32 %r259, [%r252+8];
|
| 383 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 384 |
+
mul.lo.s32 %r16, %r239, %r129;
|
| 385 |
+
mul.lo.s32 %r18, %r240, %r129;
|
| 386 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 387 |
+
bar.sync 0;
|
| 388 |
+
// begin inline asm
|
| 389 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r16;
|
| 390 |
+
// end inline asm
|
| 391 |
+
// begin inline asm
|
| 392 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r18;
|
| 393 |
+
// end inline asm
|
| 394 |
+
bar.sync 0;
|
| 395 |
+
// begin inline asm
|
| 396 |
+
@%p5 ld.shared.b32 %r19, [ %r8 + 0 ];
|
| 397 |
+
// end inline asm
|
| 398 |
+
shfl.sync.bfly.b32 %r260, %r19, 1, 31, -1;
|
| 399 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 400 |
+
add.s32 %r22, %r260, %r19;
|
| 401 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 402 |
+
// begin inline asm
|
| 403 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r22;
|
| 404 |
+
// end inline asm
|
| 405 |
+
// begin inline asm
|
| 406 |
+
@%p5 ld.shared.b32 %r23, [ %r12 + 0 ];
|
| 407 |
+
// end inline asm
|
| 408 |
+
shfl.sync.bfly.b32 %r261, %r23, 1, 31, -1;
|
| 409 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 410 |
+
add.s32 %r26, %r261, %r23;
|
| 411 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 412 |
+
// begin inline asm
|
| 413 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r26;
|
| 414 |
+
// end inline asm
|
| 415 |
+
bar.sync 0;
|
| 416 |
+
ld.shared.b32 %r262, [%r252];
|
| 417 |
+
ld.shared.b32 %r263, [%r252+8];
|
| 418 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 419 |
+
mul.lo.s32 %r28, %r245, %r132;
|
| 420 |
+
mul.lo.s32 %r30, %r246, %r132;
|
| 421 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 422 |
+
bar.sync 0;
|
| 423 |
+
// begin inline asm
|
| 424 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r28;
|
| 425 |
+
// end inline asm
|
| 426 |
+
// begin inline asm
|
| 427 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r30;
|
| 428 |
+
// end inline asm
|
| 429 |
+
bar.sync 0;
|
| 430 |
+
// begin inline asm
|
| 431 |
+
@%p5 ld.shared.b32 %r31, [ %r8 + 0 ];
|
| 432 |
+
// end inline asm
|
| 433 |
+
shfl.sync.bfly.b32 %r264, %r31, 1, 31, -1;
|
| 434 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 435 |
+
add.s32 %r34, %r264, %r31;
|
| 436 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 437 |
+
// begin inline asm
|
| 438 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r34;
|
| 439 |
+
// end inline asm
|
| 440 |
+
// begin inline asm
|
| 441 |
+
@%p5 ld.shared.b32 %r35, [ %r12 + 0 ];
|
| 442 |
+
// end inline asm
|
| 443 |
+
shfl.sync.bfly.b32 %r265, %r35, 1, 31, -1;
|
| 444 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 445 |
+
add.s32 %r38, %r265, %r35;
|
| 446 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 447 |
+
// begin inline asm
|
| 448 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r38;
|
| 449 |
+
// end inline asm
|
| 450 |
+
bar.sync 0;
|
| 451 |
+
ld.shared.b32 %r266, [%r252];
|
| 452 |
+
ld.shared.b32 %r267, [%r252+8];
|
| 453 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 454 |
+
mul.lo.s32 %r40, %r245, %r129;
|
| 455 |
+
mul.lo.s32 %r42, %r246, %r129;
|
| 456 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 457 |
+
bar.sync 0;
|
| 458 |
+
// begin inline asm
|
| 459 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r40;
|
| 460 |
+
// end inline asm
|
| 461 |
+
// begin inline asm
|
| 462 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r42;
|
| 463 |
+
// end inline asm
|
| 464 |
+
bar.sync 0;
|
| 465 |
+
// begin inline asm
|
| 466 |
+
@%p5 ld.shared.b32 %r43, [ %r8 + 0 ];
|
| 467 |
+
// end inline asm
|
| 468 |
+
shfl.sync.bfly.b32 %r268, %r43, 1, 31, -1;
|
| 469 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 470 |
+
add.s32 %r46, %r268, %r43;
|
| 471 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 472 |
+
// begin inline asm
|
| 473 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r46;
|
| 474 |
+
// end inline asm
|
| 475 |
+
// begin inline asm
|
| 476 |
+
@%p5 ld.shared.b32 %r47, [ %r12 + 0 ];
|
| 477 |
+
// end inline asm
|
| 478 |
+
shfl.sync.bfly.b32 %r269, %r47, 1, 31, -1;
|
| 479 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 480 |
+
add.s32 %r50, %r269, %r47;
|
| 481 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 482 |
+
// begin inline asm
|
| 483 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r50;
|
| 484 |
+
// end inline asm
|
| 485 |
+
bar.sync 0;
|
| 486 |
+
ld.shared.b32 %r270, [%r252];
|
| 487 |
+
ld.shared.b32 %r271, [%r252+8];
|
| 488 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 489 |
+
setp.lt.s32 %p93, %r258, %r262;
|
| 490 |
+
setp.ge.s32 %p94, %r259, %r263;
|
| 491 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 492 |
+
setp.eq.b32 %p95, %r258, %r262;
|
| 493 |
+
setp.ne.b32 %p96, %r259, %r263;
|
| 494 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 495 |
+
setp.gt.s32 %p97, %r266, %r270;
|
| 496 |
+
setp.le.s32 %p98, %r267, %r271;
|
| 497 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 498 |
+
and.pred %p99, %p95, %p97;
|
| 499 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 500 |
+
or.pred %p100, %p96, %p98;
|
| 501 |
+
or.pred %p101, %p93, %p99;
|
| 502 |
+
.loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 503 |
+
and.pred %p102, %p94, %p100;
|
| 504 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 505 |
+
xor.b32 %r272, %r262, %r258;
|
| 506 |
+
xor.b32 %r273, %r263, %r259;
|
| 507 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 508 |
+
selp.b32 %r274, %r272, 0, %p101;
|
| 509 |
+
selp.b32 %r275, %r273, 0, %p102;
|
| 510 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 511 |
+
xor.b32 %r276, %r274, %r239;
|
| 512 |
+
xor.b32 %r277, %r275, %r240;
|
| 513 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 514 |
+
xor.b32 %r278, %r270, %r266;
|
| 515 |
+
xor.b32 %r279, %r271, %r267;
|
| 516 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 517 |
+
selp.b32 %r280, %r278, 0, %p101;
|
| 518 |
+
selp.b32 %r281, %r279, 0, %p102;
|
| 519 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 520 |
+
xor.b32 %r282, %r280, %r245;
|
| 521 |
+
xor.b32 %r283, %r281, %r246;
|
| 522 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 523 |
+
mul.lo.s32 %r284, %r276, %r131;
|
| 524 |
+
mul.lo.s32 %r285, %r277, %r131;
|
| 525 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 526 |
+
shfl.sync.bfly.b32 %r286, %r284, 16, 31, -1;
|
| 527 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 528 |
+
add.s32 %r287, %r284, %r286;
|
| 529 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 530 |
+
shfl.sync.bfly.b32 %r288, %r285, 16, 31, -1;
|
| 531 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 532 |
+
add.s32 %r289, %r285, %r288;
|
| 533 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 534 |
+
mul.lo.s32 %r290, %r276, %r124;
|
| 535 |
+
mul.lo.s32 %r291, %r277, %r124;
|
| 536 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 537 |
+
shfl.sync.bfly.b32 %r292, %r290, 16, 31, -1;
|
| 538 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 539 |
+
add.s32 %r293, %r290, %r292;
|
| 540 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 541 |
+
shfl.sync.bfly.b32 %r294, %r291, 16, 31, -1;
|
| 542 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 543 |
+
add.s32 %r295, %r291, %r294;
|
| 544 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 545 |
+
mul.lo.s32 %r296, %r282, %r131;
|
| 546 |
+
mul.lo.s32 %r297, %r283, %r131;
|
| 547 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 548 |
+
shfl.sync.bfly.b32 %r298, %r296, 16, 31, -1;
|
| 549 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 550 |
+
add.s32 %r299, %r296, %r298;
|
| 551 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 552 |
+
shfl.sync.bfly.b32 %r300, %r297, 16, 31, -1;
|
| 553 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 554 |
+
add.s32 %r301, %r297, %r300;
|
| 555 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 556 |
+
mul.lo.s32 %r302, %r282, %r124;
|
| 557 |
+
mul.lo.s32 %r303, %r283, %r124;
|
| 558 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 559 |
+
shfl.sync.bfly.b32 %r304, %r302, 16, 31, -1;
|
| 560 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 561 |
+
add.s32 %r305, %r304, %r302;
|
| 562 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 563 |
+
shfl.sync.bfly.b32 %r306, %r303, 16, 31, -1;
|
| 564 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 565 |
+
add.s32 %r307, %r306, %r303;
|
| 566 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 567 |
+
setp.lt.s32 %p103, %r287, %r293;
|
| 568 |
+
setp.ge.s32 %p104, %r289, %r295;
|
| 569 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 570 |
+
setp.eq.b32 %p105, %r287, %r293;
|
| 571 |
+
setp.ne.b32 %p106, %r289, %r295;
|
| 572 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 573 |
+
setp.gt.s32 %p107, %r299, %r305;
|
| 574 |
+
setp.le.s32 %p108, %r301, %r307;
|
| 575 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 576 |
+
and.pred %p109, %p105, %p107;
|
| 577 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 578 |
+
or.pred %p110, %p106, %p108;
|
| 579 |
+
or.pred %p111, %p103, %p109;
|
| 580 |
+
.loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 581 |
+
and.pred %p112, %p104, %p110;
|
| 582 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 583 |
+
xor.b32 %r308, %r287, %r293;
|
| 584 |
+
xor.b32 %r309, %r289, %r295;
|
| 585 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 586 |
+
selp.b32 %r310, %r308, 0, %p111;
|
| 587 |
+
selp.b32 %r311, %r309, 0, %p112;
|
| 588 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 589 |
+
xor.b32 %r312, %r310, %r276;
|
| 590 |
+
xor.b32 %r313, %r311, %r277;
|
| 591 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 592 |
+
xor.b32 %r314, %r305, %r299;
|
| 593 |
+
xor.b32 %r315, %r307, %r301;
|
| 594 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 595 |
+
selp.b32 %r316, %r314, 0, %p111;
|
| 596 |
+
selp.b32 %r317, %r315, 0, %p112;
|
| 597 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 598 |
+
xor.b32 %r318, %r316, %r282;
|
| 599 |
+
xor.b32 %r319, %r317, %r283;
|
| 600 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 601 |
+
mul.lo.s32 %r320, %r312, %r130;
|
| 602 |
+
mul.lo.s32 %r321, %r313, %r130;
|
| 603 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 604 |
+
shfl.sync.bfly.b32 %r322, %r320, 8, 31, -1;
|
| 605 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 606 |
+
add.s32 %r323, %r320, %r322;
|
| 607 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 608 |
+
shfl.sync.bfly.b32 %r324, %r321, 8, 31, -1;
|
| 609 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 610 |
+
add.s32 %r325, %r321, %r324;
|
| 611 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 612 |
+
mul.lo.s32 %r326, %r312, %r127;
|
| 613 |
+
mul.lo.s32 %r327, %r313, %r127;
|
| 614 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 615 |
+
shfl.sync.bfly.b32 %r328, %r326, 8, 31, -1;
|
| 616 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 617 |
+
add.s32 %r329, %r326, %r328;
|
| 618 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 619 |
+
shfl.sync.bfly.b32 %r330, %r327, 8, 31, -1;
|
| 620 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 621 |
+
add.s32 %r331, %r327, %r330;
|
| 622 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 623 |
+
mul.lo.s32 %r332, %r318, %r130;
|
| 624 |
+
mul.lo.s32 %r333, %r319, %r130;
|
| 625 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 626 |
+
shfl.sync.bfly.b32 %r334, %r332, 8, 31, -1;
|
| 627 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 628 |
+
add.s32 %r335, %r332, %r334;
|
| 629 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 630 |
+
shfl.sync.bfly.b32 %r336, %r333, 8, 31, -1;
|
| 631 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 632 |
+
add.s32 %r337, %r333, %r336;
|
| 633 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 634 |
+
mul.lo.s32 %r338, %r318, %r127;
|
| 635 |
+
mul.lo.s32 %r339, %r319, %r127;
|
| 636 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 637 |
+
shfl.sync.bfly.b32 %r340, %r338, 8, 31, -1;
|
| 638 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 639 |
+
add.s32 %r341, %r340, %r338;
|
| 640 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 641 |
+
shfl.sync.bfly.b32 %r342, %r339, 8, 31, -1;
|
| 642 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 643 |
+
add.s32 %r343, %r342, %r339;
|
| 644 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 645 |
+
setp.lt.s32 %p113, %r323, %r329;
|
| 646 |
+
setp.ge.s32 %p114, %r325, %r331;
|
| 647 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 648 |
+
setp.eq.b32 %p115, %r323, %r329;
|
| 649 |
+
setp.ne.b32 %p116, %r325, %r331;
|
| 650 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 651 |
+
setp.gt.s32 %p117, %r335, %r341;
|
| 652 |
+
setp.le.s32 %p118, %r337, %r343;
|
| 653 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 654 |
+
and.pred %p119, %p115, %p117;
|
| 655 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 656 |
+
or.pred %p120, %p116, %p118;
|
| 657 |
+
or.pred %p121, %p113, %p119;
|
| 658 |
+
.loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 659 |
+
and.pred %p122, %p114, %p120;
|
| 660 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 661 |
+
xor.b32 %r344, %r323, %r329;
|
| 662 |
+
xor.b32 %r345, %r325, %r331;
|
| 663 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 664 |
+
selp.b32 %r346, %r344, 0, %p121;
|
| 665 |
+
selp.b32 %r347, %r345, 0, %p122;
|
| 666 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 667 |
+
xor.b32 %r348, %r346, %r312;
|
| 668 |
+
xor.b32 %r349, %r347, %r313;
|
| 669 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 670 |
+
xor.b32 %r350, %r341, %r335;
|
| 671 |
+
xor.b32 %r351, %r343, %r337;
|
| 672 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 673 |
+
selp.b32 %r352, %r350, 0, %p121;
|
| 674 |
+
selp.b32 %r353, %r351, 0, %p122;
|
| 675 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 676 |
+
xor.b32 %r354, %r352, %r318;
|
| 677 |
+
xor.b32 %r355, %r353, %r319;
|
| 678 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 679 |
+
setp.lt.s32 %p123, %r348, %r349;
|
| 680 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 681 |
+
setp.eq.b32 %p124, %r348, %r349;
|
| 682 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 683 |
+
setp.gt.s32 %p125, %r354, %r355;
|
| 684 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 685 |
+
and.pred %p126, %p124, %p125;
|
| 686 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 687 |
+
or.pred %p127, %p123, %p126;
|
| 688 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 689 |
+
xor.b32 %r356, %r349, %r348;
|
| 690 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 691 |
+
selp.b32 %r357, %r356, 0, %p127;
|
| 692 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 693 |
+
xor.b32 %r358, %r357, %r348;
|
| 694 |
+
xor.b32 %r359, %r357, %r349;
|
| 695 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 696 |
+
xor.b32 %r360, %r355, %r354;
|
| 697 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 698 |
+
selp.b32 %r361, %r360, 0, %p127;
|
| 699 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 700 |
+
xor.b32 %r362, %r361, %r354;
|
| 701 |
+
xor.b32 %r363, %r361, %r355;
|
| 702 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 703 |
+
mul.lo.s32 %r52, %r358, %r132;
|
| 704 |
+
mul.lo.s32 %r54, %r359, %r132;
|
| 705 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 706 |
+
bar.sync 0;
|
| 707 |
+
// begin inline asm
|
| 708 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r52;
|
| 709 |
+
// end inline asm
|
| 710 |
+
// begin inline asm
|
| 711 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r54;
|
| 712 |
+
// end inline asm
|
| 713 |
+
bar.sync 0;
|
| 714 |
+
// begin inline asm
|
| 715 |
+
@%p5 ld.shared.b32 %r55, [ %r8 + 0 ];
|
| 716 |
+
// end inline asm
|
| 717 |
+
shfl.sync.bfly.b32 %r364, %r55, 1, 31, -1;
|
| 718 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 719 |
+
add.s32 %r58, %r364, %r55;
|
| 720 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 721 |
+
// begin inline asm
|
| 722 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r58;
|
| 723 |
+
// end inline asm
|
| 724 |
+
// begin inline asm
|
| 725 |
+
@%p5 ld.shared.b32 %r59, [ %r12 + 0 ];
|
| 726 |
+
// end inline asm
|
| 727 |
+
shfl.sync.bfly.b32 %r365, %r59, 1, 31, -1;
|
| 728 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 729 |
+
add.s32 %r62, %r365, %r59;
|
| 730 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 731 |
+
// begin inline asm
|
| 732 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r62;
|
| 733 |
+
// end inline asm
|
| 734 |
+
bar.sync 0;
|
| 735 |
+
ld.shared.b32 %r366, [%r252];
|
| 736 |
+
ld.shared.b32 %r367, [%r252+8];
|
| 737 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 738 |
+
mul.lo.s32 %r64, %r358, %r129;
|
| 739 |
+
mul.lo.s32 %r66, %r359, %r129;
|
| 740 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 741 |
+
bar.sync 0;
|
| 742 |
+
// begin inline asm
|
| 743 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r64;
|
| 744 |
+
// end inline asm
|
| 745 |
+
// begin inline asm
|
| 746 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r66;
|
| 747 |
+
// end inline asm
|
| 748 |
+
bar.sync 0;
|
| 749 |
+
// begin inline asm
|
| 750 |
+
@%p5 ld.shared.b32 %r67, [ %r8 + 0 ];
|
| 751 |
+
// end inline asm
|
| 752 |
+
shfl.sync.bfly.b32 %r368, %r67, 1, 31, -1;
|
| 753 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 754 |
+
add.s32 %r70, %r368, %r67;
|
| 755 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 756 |
+
// begin inline asm
|
| 757 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r70;
|
| 758 |
+
// end inline asm
|
| 759 |
+
// begin inline asm
|
| 760 |
+
@%p5 ld.shared.b32 %r71, [ %r12 + 0 ];
|
| 761 |
+
// end inline asm
|
| 762 |
+
shfl.sync.bfly.b32 %r369, %r71, 1, 31, -1;
|
| 763 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 764 |
+
add.s32 %r74, %r369, %r71;
|
| 765 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 766 |
+
// begin inline asm
|
| 767 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r74;
|
| 768 |
+
// end inline asm
|
| 769 |
+
bar.sync 0;
|
| 770 |
+
ld.shared.b32 %r370, [%r252];
|
| 771 |
+
ld.shared.b32 %r371, [%r252+8];
|
| 772 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 773 |
+
mul.lo.s32 %r76, %r362, %r132;
|
| 774 |
+
mul.lo.s32 %r78, %r363, %r132;
|
| 775 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 776 |
+
bar.sync 0;
|
| 777 |
+
// begin inline asm
|
| 778 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r76;
|
| 779 |
+
// end inline asm
|
| 780 |
+
// begin inline asm
|
| 781 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r78;
|
| 782 |
+
// end inline asm
|
| 783 |
+
bar.sync 0;
|
| 784 |
+
// begin inline asm
|
| 785 |
+
@%p5 ld.shared.b32 %r79, [ %r8 + 0 ];
|
| 786 |
+
// end inline asm
|
| 787 |
+
shfl.sync.bfly.b32 %r372, %r79, 1, 31, -1;
|
| 788 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 789 |
+
add.s32 %r82, %r372, %r79;
|
| 790 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 791 |
+
// begin inline asm
|
| 792 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r82;
|
| 793 |
+
// end inline asm
|
| 794 |
+
// begin inline asm
|
| 795 |
+
@%p5 ld.shared.b32 %r83, [ %r12 + 0 ];
|
| 796 |
+
// end inline asm
|
| 797 |
+
shfl.sync.bfly.b32 %r373, %r83, 1, 31, -1;
|
| 798 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 799 |
+
add.s32 %r86, %r373, %r83;
|
| 800 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 801 |
+
// begin inline asm
|
| 802 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r86;
|
| 803 |
+
// end inline asm
|
| 804 |
+
bar.sync 0;
|
| 805 |
+
ld.shared.b32 %r374, [%r252];
|
| 806 |
+
ld.shared.b32 %r375, [%r252+8];
|
| 807 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 808 |
+
mul.lo.s32 %r88, %r362, %r129;
|
| 809 |
+
mul.lo.s32 %r90, %r363, %r129;
|
| 810 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 811 |
+
bar.sync 0;
|
| 812 |
+
// begin inline asm
|
| 813 |
+
@%p3 st.shared.b32 [ %r3 + 0 ], %r88;
|
| 814 |
+
// end inline asm
|
| 815 |
+
// begin inline asm
|
| 816 |
+
@%p3 st.shared.b32 [ %r5 + 0 ], %r90;
|
| 817 |
+
// end inline asm
|
| 818 |
+
bar.sync 0;
|
| 819 |
+
// begin inline asm
|
| 820 |
+
@%p5 ld.shared.b32 %r91, [ %r8 + 0 ];
|
| 821 |
+
// end inline asm
|
| 822 |
+
shfl.sync.bfly.b32 %r376, %r91, 1, 31, -1;
|
| 823 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 824 |
+
add.s32 %r94, %r376, %r91;
|
| 825 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 826 |
+
// begin inline asm
|
| 827 |
+
@%p6 st.shared.b32 [ %r8 + 0 ], %r94;
|
| 828 |
+
// end inline asm
|
| 829 |
+
// begin inline asm
|
| 830 |
+
@%p5 ld.shared.b32 %r95, [ %r12 + 0 ];
|
| 831 |
+
// end inline asm
|
| 832 |
+
shfl.sync.bfly.b32 %r377, %r95, 1, 31, -1;
|
| 833 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 834 |
+
add.s32 %r98, %r377, %r95;
|
| 835 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 836 |
+
// begin inline asm
|
| 837 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r98;
|
| 838 |
+
// end inline asm
|
| 839 |
+
bar.sync 0;
|
| 840 |
+
ld.shared.b32 %r378, [%r252];
|
| 841 |
+
ld.shared.b32 %r379, [%r252+8];
|
| 842 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 843 |
+
setp.lt.s32 %p128, %r366, %r370;
|
| 844 |
+
setp.lt.s32 %p129, %r367, %r371;
|
| 845 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 846 |
+
setp.eq.b32 %p130, %r366, %r370;
|
| 847 |
+
setp.eq.b32 %p131, %r367, %r371;
|
| 848 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 849 |
+
setp.gt.s32 %p132, %r374, %r378;
|
| 850 |
+
setp.gt.s32 %p133, %r375, %r379;
|
| 851 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 852 |
+
and.pred %p134, %p130, %p132;
|
| 853 |
+
and.pred %p135, %p131, %p133;
|
| 854 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 855 |
+
or.pred %p136, %p128, %p134;
|
| 856 |
+
or.pred %p137, %p129, %p135;
|
| 857 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 858 |
+
xor.b32 %r380, %r370, %r366;
|
| 859 |
+
xor.b32 %r381, %r371, %r367;
|
| 860 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 861 |
+
selp.b32 %r382, %r380, 0, %p136;
|
| 862 |
+
selp.b32 %r383, %r381, 0, %p137;
|
| 863 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 864 |
+
xor.b32 %r384, %r382, %r358;
|
| 865 |
+
xor.b32 %r385, %r383, %r359;
|
| 866 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 867 |
+
xor.b32 %r386, %r378, %r374;
|
| 868 |
+
xor.b32 %r387, %r379, %r375;
|
| 869 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 870 |
+
selp.b32 %r388, %r386, 0, %p136;
|
| 871 |
+
selp.b32 %r389, %r387, 0, %p137;
|
| 872 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 873 |
+
xor.b32 %r390, %r388, %r362;
|
| 874 |
+
xor.b32 %r391, %r389, %r363;
|
| 875 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 876 |
+
mul.lo.s32 %r392, %r384, %r131;
|
| 877 |
+
mul.lo.s32 %r393, %r385, %r131;
|
| 878 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 879 |
+
shfl.sync.bfly.b32 %r394, %r392, 16, 31, -1;
|
| 880 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 881 |
+
add.s32 %r395, %r392, %r394;
|
| 882 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 883 |
+
shfl.sync.bfly.b32 %r396, %r393, 16, 31, -1;
|
| 884 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 885 |
+
add.s32 %r397, %r393, %r396;
|
| 886 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 887 |
+
mul.lo.s32 %r398, %r384, %r124;
|
| 888 |
+
mul.lo.s32 %r399, %r385, %r124;
|
| 889 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 890 |
+
shfl.sync.bfly.b32 %r400, %r398, 16, 31, -1;
|
| 891 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 892 |
+
add.s32 %r401, %r398, %r400;
|
| 893 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 894 |
+
shfl.sync.bfly.b32 %r402, %r399, 16, 31, -1;
|
| 895 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 896 |
+
add.s32 %r403, %r399, %r402;
|
| 897 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 898 |
+
mul.lo.s32 %r404, %r390, %r131;
|
| 899 |
+
mul.lo.s32 %r405, %r391, %r131;
|
| 900 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 901 |
+
shfl.sync.bfly.b32 %r406, %r404, 16, 31, -1;
|
| 902 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 903 |
+
add.s32 %r407, %r404, %r406;
|
| 904 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 905 |
+
shfl.sync.bfly.b32 %r408, %r405, 16, 31, -1;
|
| 906 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 907 |
+
add.s32 %r409, %r405, %r408;
|
| 908 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 909 |
+
mul.lo.s32 %r410, %r390, %r124;
|
| 910 |
+
mul.lo.s32 %r411, %r391, %r124;
|
| 911 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 912 |
+
shfl.sync.bfly.b32 %r412, %r410, 16, 31, -1;
|
| 913 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 914 |
+
add.s32 %r413, %r412, %r410;
|
| 915 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 916 |
+
shfl.sync.bfly.b32 %r414, %r411, 16, 31, -1;
|
| 917 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 918 |
+
add.s32 %r415, %r414, %r411;
|
| 919 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 920 |
+
setp.lt.s32 %p138, %r395, %r401;
|
| 921 |
+
setp.lt.s32 %p139, %r397, %r403;
|
| 922 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 923 |
+
setp.eq.b32 %p140, %r395, %r401;
|
| 924 |
+
setp.eq.b32 %p141, %r397, %r403;
|
| 925 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 926 |
+
setp.gt.s32 %p142, %r407, %r413;
|
| 927 |
+
setp.gt.s32 %p143, %r409, %r415;
|
| 928 |
+
.loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 929 |
+
and.pred %p144, %p140, %p142;
|
| 930 |
+
and.pred %p145, %p141, %p143;
|
| 931 |
+
.loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 932 |
+
or.pred %p146, %p138, %p144;
|
| 933 |
+
or.pred %p147, %p139, %p145;
|
| 934 |
+
.loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 935 |
+
xor.b32 %r416, %r395, %r401;
|
| 936 |
+
xor.b32 %r417, %r397, %r403;
|
| 937 |
+
.loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 938 |
+
selp.b32 %r418, %r416, 0, %p146;
|
| 939 |
+
selp.b32 %r419, %r417, 0, %p147;
|
| 940 |
+
.loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 941 |
+
xor.b32 %r420, %r418, %r384;
|
| 942 |
+
xor.b32 %r421, %r419, %r385;
|
| 943 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 944 |
+
xor.b32 %r422, %r413, %r407;
|
| 945 |
+
xor.b32 %r423, %r415, %r409;
|
| 946 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 947 |
+
selp.b32 %r424, %r422, 0, %p146;
|
| 948 |
+
selp.b32 %r425, %r423, 0, %p147;
|
| 949 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 950 |
+
xor.b32 %r426, %r424, %r390;
|
| 951 |
+
xor.b32 %r427, %r425, %r391;
|
| 952 |
+
.loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 953 |
+
mul.lo.s32 %r428, %r420, %r130;
|
| 954 |
+
mul.lo.s32 %r429, %r421, %r130;
|
| 955 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 956 |
+
shfl.sync.bfly.b32 %r430, %r428, 8, 31, -1;
|
| 957 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 958 |
+
add.s32 %r431, %r428, %r430;
|
| 959 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 960 |
+
shfl.sync.bfly.b32 %r432, %r429, 8, 31, -1;
|
| 961 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 962 |
+
add.s32 %r433, %r429, %r432;
|
| 963 |
+
.loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 964 |
+
mul.lo.s32 %r434, %r420, %r127;
|
| 965 |
+
mul.lo.s32 %r435, %r421, %r127;
|
| 966 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 967 |
+
shfl.sync.bfly.b32 %r436, %r434, 8, 31, -1;
|
| 968 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 969 |
+
add.s32 %r437, %r434, %r436;
|
| 970 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 971 |
+
shfl.sync.bfly.b32 %r438, %r435, 8, 31, -1;
|
| 972 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 973 |
+
add.s32 %r439, %r435, %r438;
|
| 974 |
+
.loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 975 |
+
mul.lo.s32 %r440, %r426, %r130;
|
| 976 |
+
mul.lo.s32 %r441, %r427, %r130;
|
| 977 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 978 |
+
shfl.sync.bfly.b32 %r442, %r440, 8, 31, -1;
|
| 979 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 980 |
+
add.s32 %r443, %r440, %r442;
|
| 981 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 982 |
+
shfl.sync.bfly.b32 %r444, %r441, 8, 31, -1;
|
| 983 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 984 |
+
add.s32 %r445, %r441, %r444;
|
| 985 |
+
.loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 986 |
+
mul.lo.s32 %r446, %r426, %r127;
|
| 987 |
+
mul.lo.s32 %r447, %r427, %r127;
|
| 988 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 989 |
+
shfl.sync.bfly.b32 %r448, %r446, 8, 31, -1;
|
| 990 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 991 |
+
add.s32 %r449, %r448, %r446;
|
| 992 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 993 |
+
shfl.sync.bfly.b32 %r450, %r447, 8, 31, -1;
|
| 994 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 995 |
+
add.s32 %r451, %r450, %r447;
|
| 996 |
+
.loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 997 |
+
setp.lt.s32 %p148, %r431, %r437;
|
| 998 |
+
setp.lt.s32 %p149, %r433, %r439;
|
| 999 |
+
.loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 1000 |
+
setp.eq.b32 %p150, %r431, %r437;
|
| 1001 |
+
setp.eq.b32 %p151, %r433, %r439;
|
| 1002 |
+
.loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 1003 |
+
setp.gt.s32 %p152, %r443, %r449;
|
| 1004 |
+
setp.gt.s32 %p153, %r445, %r451;
|
| 1005 |
+
.loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 1006 |
+
xor.b32 %r452, %r449, %r443;
|
| 1007 |
+
xor.b32 %r453, %r451, %r445;
|
| 1008 |
+
.loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 1009 |
+
selp.b32 %r454, %r452, 0, %p152;
|
| 1010 |
+
selp.b32 %r455, %r454, 0, %p150;
|
| 1011 |
+
selp.b32 %r456, %r452, %r455, %p148;
|
| 1012 |
+
selp.b32 %r457, %r453, 0, %p153;
|
| 1013 |
+
selp.b32 %r458, %r457, 0, %p151;
|
| 1014 |
+
selp.b32 %r459, %r453, %r458, %p149;
|
| 1015 |
+
.loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ]
|
| 1016 |
+
xor.b32 %r460, %r456, %r426;
|
| 1017 |
+
xor.b32 %r461, %r459, %r427;
|
| 1018 |
+
$L__tmp2:
|
| 1019 |
+
.loc 1 44 34 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:44:34
|
| 1020 |
+
selp.b32 %r462, %r1, 0, %p1;
|
| 1021 |
+
cvt.s64.s32 %rd11, %r462;
|
| 1022 |
+
selp.b32 %r463, %r2, 0, %p1;
|
| 1023 |
+
cvt.s64.s32 %rd12, %r463;
|
| 1024 |
+
$L__tmp3:
|
| 1025 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1026 |
+
bar.sync 0;
|
| 1027 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1028 |
+
add.s64 %rd13, %rd12, %rd11;
|
| 1029 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1030 |
+
mov.b64 {_, %r464}, %rd13;
|
| 1031 |
+
cvt.u32.u64 %r465, %rd13;
|
| 1032 |
+
shfl.sync.bfly.b32 %r466, %r465, 16, 31, -1;
|
| 1033 |
+
shfl.sync.bfly.b32 %r467, %r464, 16, 31, -1;
|
| 1034 |
+
cvt.u64.u32 %rd14, %r466;
|
| 1035 |
+
cvt.u64.u32 %rd15, %r467;
|
| 1036 |
+
shl.b64 %rd16, %rd15, 32;
|
| 1037 |
+
or.b64 %rd17, %rd14, %rd16;
|
| 1038 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1039 |
+
add.s64 %rd18, %rd13, %rd17;
|
| 1040 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1041 |
+
mov.b64 {_, %r468}, %rd18;
|
| 1042 |
+
cvt.u32.u64 %r469, %rd18;
|
| 1043 |
+
shfl.sync.bfly.b32 %r470, %r469, 8, 31, -1;
|
| 1044 |
+
shfl.sync.bfly.b32 %r471, %r468, 8, 31, -1;
|
| 1045 |
+
cvt.u64.u32 %rd19, %r470;
|
| 1046 |
+
cvt.u64.u32 %rd20, %r471;
|
| 1047 |
+
shl.b64 %rd21, %rd20, 32;
|
| 1048 |
+
or.b64 %rd22, %rd19, %rd21;
|
| 1049 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1050 |
+
add.s64 %rd3, %rd18, %rd22;
|
| 1051 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1052 |
+
setp.eq.b32 %p51, %r108, 0;
|
| 1053 |
+
shl.b32 %r472, %r109, 4;
|
| 1054 |
+
add.s32 %r473, %r251, %r472;
|
| 1055 |
+
shl.b32 %r474, %r129, 3;
|
| 1056 |
+
add.s32 %r99, %r473, %r474;
|
| 1057 |
+
// begin inline asm
|
| 1058 |
+
@%p51 st.shared.b64 [ %r99 + 0 ], %rd3;
|
| 1059 |
+
// end inline asm
|
| 1060 |
+
bar.sync 0;
|
| 1061 |
+
setp.lt.u32 %p52, %r107, 16;
|
| 1062 |
+
shl.b32 %r475, %r107, 3;
|
| 1063 |
+
add.s32 %r100, %r251, %r475;
|
| 1064 |
+
// begin inline asm
|
| 1065 |
+
@%p52 ld.shared.b64 %rd4, [ %r100 + 0 ];
|
| 1066 |
+
// end inline asm
|
| 1067 |
+
mov.b64 {_, %r476}, %rd4;
|
| 1068 |
+
cvt.u32.u64 %r477, %rd4;
|
| 1069 |
+
shfl.sync.bfly.b32 %r478, %r477, 1, 31, -1;
|
| 1070 |
+
shfl.sync.bfly.b32 %r479, %r476, 1, 31, -1;
|
| 1071 |
+
cvt.u64.u32 %rd23, %r478;
|
| 1072 |
+
cvt.u64.u32 %rd24, %r479;
|
| 1073 |
+
shl.b64 %rd25, %rd24, 32;
|
| 1074 |
+
or.b64 %rd26, %rd23, %rd25;
|
| 1075 |
+
.loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1076 |
+
add.s64 %rd5, %rd4, %rd26;
|
| 1077 |
+
.loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ]
|
| 1078 |
+
and.b32 %r480, %r107, 1009;
|
| 1079 |
+
setp.eq.b32 %p53, %r480, 0;
|
| 1080 |
+
// begin inline asm
|
| 1081 |
+
@%p53 st.shared.b64 [ %r100 + 0 ], %rd5;
|
| 1082 |
+
// end inline asm
|
| 1083 |
+
bar.sync 0;
|
| 1084 |
+
ld.shared.b32 %r104, [%r473];
|
| 1085 |
+
$L__tmp4:
|
| 1086 |
+
.loc 1 49 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:35
|
| 1087 |
+
shl.b32 %r481, %r112, 4;
|
| 1088 |
+
.loc 1 49 32 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:32
|
| 1089 |
+
or.b32 %r482, %r481, %r114;
|
| 1090 |
+
.loc 1 49 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:25
|
| 1091 |
+
mad.wide.s32 %rd6, %r482, 4, %rd9;
|
| 1092 |
+
.loc 1 49 47 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:47
|
| 1093 |
+
bar.sync 0;
|
| 1094 |
+
and.b32 %r483, %r107, 3;
|
| 1095 |
+
shl.b32 %r484, %r483, 3;
|
| 1096 |
+
and.b32 %r485, %r247, 96;
|
| 1097 |
+
bfe.s32 %r486, %r107, 2, 1;
|
| 1098 |
+
and.b32 %r487, %r486, 192;
|
| 1099 |
+
and.b32 %r488, %r125, 260;
|
| 1100 |
+
or.b32 %r489, %r484, %r485;
|
| 1101 |
+
xor.b32 %r490, %r489, %r487;
|
| 1102 |
+
or.b32 %r491, %r490, %r488;
|
| 1103 |
+
add.s32 %r492, %r251, %r491;
|
| 1104 |
+
st.shared.b32 [%r492], %r460;
|
| 1105 |
+
xor.b32 %r493, %r491, 4;
|
| 1106 |
+
add.s32 %r494, %r251, %r493;
|
| 1107 |
+
st.shared.b32 [%r494], %r461;
|
| 1108 |
+
bar.sync 0;
|
| 1109 |
+
shl.b32 %r495, %r483, 5;
|
| 1110 |
+
and.b32 %r496, %r107, 28;
|
| 1111 |
+
and.b32 %r497, %r128, 192;
|
| 1112 |
+
or.b32 %r498, %r495, %r496;
|
| 1113 |
+
xor.b32 %r499, %r498, %r497;
|
| 1114 |
+
add.s32 %r500, %r251, %r499;
|
| 1115 |
+
ld.shared.b32 %r102, [%r500];
|
| 1116 |
+
xor.b32 %r501, %r499, 4;
|
| 1117 |
+
add.s32 %r502, %r251, %r501;
|
| 1118 |
+
ld.shared.b32 %r103, [%r502+256];
|
| 1119 |
+
// begin inline asm
|
| 1120 |
+
@%p54 st.global.v2.b32 [ %rd6 + 0 ], { %r102, %r103 };
|
| 1121 |
+
// end inline asm
|
| 1122 |
+
.loc 1 50 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:25
|
| 1123 |
+
mad.wide.s32 %rd7, %r111, 4, %rd10;
|
| 1124 |
+
.loc 1 50 37 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:37
|
| 1125 |
+
and.b32 %r503, %r107, 56;
|
| 1126 |
+
setp.eq.b32 %p154, %r503, 0;
|
| 1127 |
+
and.pred %p55, %p154, %p1;
|
| 1128 |
+
// begin inline asm
|
| 1129 |
+
@%p55 st.global.b32 [ %rd7 + 0 ], { %r104 };
|
| 1130 |
+
// end inline asm
|
| 1131 |
+
.loc 1 50 4 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:4
|
| 1132 |
+
ret;
|
| 1133 |
+
$L__tmp5:
|
| 1134 |
+
$L__func_end0:
|
| 1135 |
+
// -- End function
|
| 1136 |
+
}
|
| 1137 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py"
|
| 1138 |
+
.file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
|
| 1139 |
+
.file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
|
| 1140 |
+
.section .debug_abbrev
|
| 1141 |
+
{
|
| 1142 |
+
.b8 1 // Abbreviation Code
|
| 1143 |
+
.b8 17 // DW_TAG_compile_unit
|
| 1144 |
+
.b8 1 // DW_CHILDREN_yes
|
| 1145 |
+
.b8 37 // DW_AT_producer
|
| 1146 |
+
.b8 8 // DW_FORM_string
|
| 1147 |
+
.b8 19 // DW_AT_language
|
| 1148 |
+
.b8 5 // DW_FORM_data2
|
| 1149 |
+
.b8 3 // DW_AT_name
|
| 1150 |
+
.b8 8 // DW_FORM_string
|
| 1151 |
+
.b8 16 // DW_AT_stmt_list
|
| 1152 |
+
.b8 6 // DW_FORM_data4
|
| 1153 |
+
.b8 27 // DW_AT_comp_dir
|
| 1154 |
+
.b8 8 // DW_FORM_string
|
| 1155 |
+
.b8 0 // EOM(1)
|
| 1156 |
+
.b8 0 // EOM(2)
|
| 1157 |
+
.b8 2 // Abbreviation Code
|
| 1158 |
+
.b8 46 // DW_TAG_subprogram
|
| 1159 |
+
.b8 0 // DW_CHILDREN_no
|
| 1160 |
+
.b8 3 // DW_AT_name
|
| 1161 |
+
.b8 8 // DW_FORM_string
|
| 1162 |
+
.b8 32 // DW_AT_inline
|
| 1163 |
+
.b8 11 // DW_FORM_data1
|
| 1164 |
+
.b8 0 // EOM(1)
|
| 1165 |
+
.b8 0 // EOM(2)
|
| 1166 |
+
.b8 3 // Abbreviation Code
|
| 1167 |
+
.b8 46 // DW_TAG_subprogram
|
| 1168 |
+
.b8 1 // DW_CHILDREN_yes
|
| 1169 |
+
.b8 17 // DW_AT_low_pc
|
| 1170 |
+
.b8 1 // DW_FORM_addr
|
| 1171 |
+
.b8 18 // DW_AT_high_pc
|
| 1172 |
+
.b8 1 // DW_FORM_addr
|
| 1173 |
+
.b8 49 // DW_AT_abstract_origin
|
| 1174 |
+
.b8 19 // DW_FORM_ref4
|
| 1175 |
+
.b8 0 // EOM(1)
|
| 1176 |
+
.b8 0 // EOM(2)
|
| 1177 |
+
.b8 4 // Abbreviation Code
|
| 1178 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 1179 |
+
.b8 0 // DW_CHILDREN_no
|
| 1180 |
+
.b8 49 // DW_AT_abstract_origin
|
| 1181 |
+
.b8 19 // DW_FORM_ref4
|
| 1182 |
+
.b8 17 // DW_AT_low_pc
|
| 1183 |
+
.b8 1 // DW_FORM_addr
|
| 1184 |
+
.b8 18 // DW_AT_high_pc
|
| 1185 |
+
.b8 1 // DW_FORM_addr
|
| 1186 |
+
.b8 88 // DW_AT_call_file
|
| 1187 |
+
.b8 11 // DW_FORM_data1
|
| 1188 |
+
.b8 89 // DW_AT_call_line
|
| 1189 |
+
.b8 11 // DW_FORM_data1
|
| 1190 |
+
.b8 87 // DW_AT_call_column
|
| 1191 |
+
.b8 11 // DW_FORM_data1
|
| 1192 |
+
.b8 0 // EOM(1)
|
| 1193 |
+
.b8 0 // EOM(2)
|
| 1194 |
+
.b8 0 // EOM(3)
|
| 1195 |
+
}
|
| 1196 |
+
.section .debug_info
|
| 1197 |
+
{
|
| 1198 |
+
.b32 267 // Length of Unit
|
| 1199 |
+
.b8 2 // DWARF version number
|
| 1200 |
+
.b8 0
|
| 1201 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 1202 |
+
.b8 8 // Address Size (in bytes)
|
| 1203 |
+
.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit
|
| 1204 |
+
.b8 116 // DW_AT_producer
|
| 1205 |
+
.b8 114
|
| 1206 |
+
.b8 105
|
| 1207 |
+
.b8 116
|
| 1208 |
+
.b8 111
|
| 1209 |
+
.b8 110
|
| 1210 |
+
.b8 0
|
| 1211 |
+
.b8 2 // DW_AT_language
|
| 1212 |
+
.b8 0
|
| 1213 |
+
.b8 99 // DW_AT_name
|
| 1214 |
+
.b8 104
|
| 1215 |
+
.b8 120
|
| 1216 |
+
.b8 110
|
| 1217 |
+
.b8 121
|
| 1218 |
+
.b8 103
|
| 1219 |
+
.b8 112
|
| 1220 |
+
.b8 118
|
| 1221 |
+
.b8 112
|
| 1222 |
+
.b8 109
|
| 1223 |
+
.b8 118
|
| 1224 |
+
.b8 114
|
| 1225 |
+
.b8 50
|
| 1226 |
+
.b8 109
|
| 1227 |
+
.b8 120
|
| 1228 |
+
.b8 50
|
| 1229 |
+
.b8 101
|
| 1230 |
+
.b8 54
|
| 1231 |
+
.b8 109
|
| 1232 |
+
.b8 119
|
| 1233 |
+
.b8 103
|
| 1234 |
+
.b8 100
|
| 1235 |
+
.b8 101
|
| 1236 |
+
.b8 111
|
| 1237 |
+
.b8 106
|
| 1238 |
+
.b8 116
|
| 1239 |
+
.b8 104
|
| 1240 |
+
.b8 114
|
| 1241 |
+
.b8 105
|
| 1242 |
+
.b8 114
|
| 1243 |
+
.b8 110
|
| 1244 |
+
.b8 111
|
| 1245 |
+
.b8 103
|
| 1246 |
+
.b8 55
|
| 1247 |
+
.b8 110
|
| 1248 |
+
.b8 109
|
| 1249 |
+
.b8 113
|
| 1250 |
+
.b8 54
|
| 1251 |
+
.b8 109
|
| 1252 |
+
.b8 99
|
| 1253 |
+
.b8 115
|
| 1254 |
+
.b8 105
|
| 1255 |
+
.b8 51
|
| 1256 |
+
.b8 119
|
| 1257 |
+
.b8 118
|
| 1258 |
+
.b8 101
|
| 1259 |
+
.b8 103
|
| 1260 |
+
.b8 118
|
| 1261 |
+
.b8 105
|
| 1262 |
+
.b8 50
|
| 1263 |
+
.b8 115
|
| 1264 |
+
.b8 111
|
| 1265 |
+
.b8 46
|
| 1266 |
+
.b8 112
|
| 1267 |
+
.b8 121
|
| 1268 |
+
.b8 0
|
| 1269 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 1270 |
+
.b8 47 // DW_AT_comp_dir
|
| 1271 |
+
.b8 119
|
| 1272 |
+
.b8 111
|
| 1273 |
+
.b8 114
|
| 1274 |
+
.b8 107
|
| 1275 |
+
.b8 115
|
| 1276 |
+
.b8 112
|
| 1277 |
+
.b8 97
|
| 1278 |
+
.b8 99
|
| 1279 |
+
.b8 101
|
| 1280 |
+
.b8 47
|
| 1281 |
+
.b8 104
|
| 1282 |
+
.b8 97
|
| 1283 |
+
.b8 110
|
| 1284 |
+
.b8 114
|
| 1285 |
+
.b8 117
|
| 1286 |
+
.b8 105
|
| 1287 |
+
.b8 47
|
| 1288 |
+
.b8 83
|
| 1289 |
+
.b8 112
|
| 1290 |
+
.b8 101
|
| 1291 |
+
.b8 99
|
| 1292 |
+
.b8 70
|
| 1293 |
+
.b8 111
|
| 1294 |
+
.b8 114
|
| 1295 |
+
.b8 103
|
| 1296 |
+
.b8 101
|
| 1297 |
+
.b8 45
|
| 1298 |
+
.b8 101
|
| 1299 |
+
.b8 120
|
| 1300 |
+
.b8 116
|
| 1301 |
+
.b8 47
|
| 1302 |
+
.b8 99
|
| 1303 |
+
.b8 97
|
| 1304 |
+
.b8 99
|
| 1305 |
+
.b8 104
|
| 1306 |
+
.b8 101
|
| 1307 |
+
.b8 47
|
| 1308 |
+
.b8 99
|
| 1309 |
+
.b8 111
|
| 1310 |
+
.b8 109
|
| 1311 |
+
.b8 112
|
| 1312 |
+
.b8 105
|
| 1313 |
+
.b8 108
|
| 1314 |
+
.b8 101
|
| 1315 |
+
.b8 100
|
| 1316 |
+
.b8 95
|
| 1317 |
+
.b8 107
|
| 1318 |
+
.b8 101
|
| 1319 |
+
.b8 114
|
| 1320 |
+
.b8 110
|
| 1321 |
+
.b8 101
|
| 1322 |
+
.b8 108
|
| 1323 |
+
.b8 115
|
| 1324 |
+
.b8 47
|
| 1325 |
+
.b8 104
|
| 1326 |
+
.b8 120
|
| 1327 |
+
.b8 0
|
| 1328 |
+
.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram
|
| 1329 |
+
.b8 116 // DW_AT_name
|
| 1330 |
+
.b8 114
|
| 1331 |
+
.b8 105
|
| 1332 |
+
.b8 116
|
| 1333 |
+
.b8 111
|
| 1334 |
+
.b8 110
|
| 1335 |
+
.b8 95
|
| 1336 |
+
.b8 112
|
| 1337 |
+
.b8 101
|
| 1338 |
+
.b8 114
|
| 1339 |
+
.b8 95
|
| 1340 |
+
.b8 102
|
| 1341 |
+
.b8 117
|
| 1342 |
+
.b8 115
|
| 1343 |
+
.b8 101
|
| 1344 |
+
.b8 100
|
| 1345 |
+
.b8 95
|
| 1346 |
+
.b8 95
|
| 1347 |
+
.b8 116
|
| 1348 |
+
.b8 111
|
| 1349 |
+
.b8 95
|
| 1350 |
+
.b8 99
|
| 1351 |
+
.b8 111
|
| 1352 |
+
.b8 112
|
| 1353 |
+
.b8 121
|
| 1354 |
+
.b8 95
|
| 1355 |
+
.b8 99
|
| 1356 |
+
.b8 108
|
| 1357 |
+
.b8 111
|
| 1358 |
+
.b8 110
|
| 1359 |
+
.b8 101
|
| 1360 |
+
.b8 95
|
| 1361 |
+
.b8 115
|
| 1362 |
+
.b8 108
|
| 1363 |
+
.b8 105
|
| 1364 |
+
.b8 99
|
| 1365 |
+
.b8 101
|
| 1366 |
+
.b8 95
|
| 1367 |
+
.b8 115
|
| 1368 |
+
.b8 111
|
| 1369 |
+
.b8 114
|
| 1370 |
+
.b8 116
|
| 1371 |
+
.b8 95
|
| 1372 |
+
.b8 115
|
| 1373 |
+
.b8 117
|
| 1374 |
+
.b8 109
|
| 1375 |
+
.b8 95
|
| 1376 |
+
.b8 116
|
| 1377 |
+
.b8 114
|
| 1378 |
+
.b8 97
|
| 1379 |
+
.b8 110
|
| 1380 |
+
.b8 115
|
| 1381 |
+
.b8 112
|
| 1382 |
+
.b8 111
|
| 1383 |
+
.b8 115
|
| 1384 |
+
.b8 101
|
| 1385 |
+
.b8 95
|
| 1386 |
+
.b8 51
|
| 1387 |
+
.b8 0
|
| 1388 |
+
.b8 1 // DW_AT_inline
|
| 1389 |
+
.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram
|
| 1390 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 1391 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 1392 |
+
.b32 139 // DW_AT_abstract_origin
|
| 1393 |
+
.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine
|
| 1394 |
+
.b32 139 // DW_AT_abstract_origin
|
| 1395 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 1396 |
+
.b64 $L__tmp2 // DW_AT_high_pc
|
| 1397 |
+
.b8 1 // DW_AT_call_file
|
| 1398 |
+
.b8 41 // DW_AT_call_line
|
| 1399 |
+
.b8 67 // DW_AT_call_column
|
| 1400 |
+
.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine
|
| 1401 |
+
.b32 139 // DW_AT_abstract_origin
|
| 1402 |
+
.b64 $L__tmp3 // DW_AT_low_pc
|
| 1403 |
+
.b64 $L__tmp4 // DW_AT_high_pc
|
| 1404 |
+
.b8 1 // DW_AT_call_file
|
| 1405 |
+
.b8 45 // DW_AT_call_line
|
| 1406 |
+
.b8 26 // DW_AT_call_column
|
| 1407 |
+
.b8 0 // End Of Children Mark
|
| 1408 |
+
.b8 0 // End Of Children Mark
|
| 1409 |
+
}
|
| 1410 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir
ADDED
|
@@ -0,0 +1,841 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [2, 1], order = [1, 0]}>
|
| 3 |
+
#linear = #ttg.linear<{register = [[0, 8]], lane = [[1, 0], [2, 0], [4, 0], [0, 1], [0, 2]], warp = [[0, 4]], block = []}>
|
| 4 |
+
#linear1 = #ttg.linear<{register = [[4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [0, 1, 0], [1, 0, 0]], warp = [[2, 0, 0]], block = []}>
|
| 5 |
+
#linear2 = #ttg.linear<{register = [[2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [0, 0, 1], [0, 1, 0]], warp = [[1, 0, 0]], block = []}>
|
| 6 |
+
#linear3 = #ttg.linear<{register = [[1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 1, 0]], block = []}>
|
| 7 |
+
#linear4 = #ttg.linear<{register = [[0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 0, 4]], block = []}>
|
| 8 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0)
|
| 9 |
+
#loc1 = loc(unknown)
|
| 10 |
+
#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
|
| 11 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67)
|
| 12 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
|
| 13 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
|
| 14 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
|
| 15 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
|
| 16 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
|
| 17 |
+
#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26)
|
| 18 |
+
#loc77 = loc("in_ptr0"(#loc))
|
| 19 |
+
#loc78 = loc("out_ptr2"(#loc))
|
| 20 |
+
#loc79 = loc("out_ptr3"(#loc))
|
| 21 |
+
#loc80 = loc("xnumel"(#loc))
|
| 22 |
+
#loc81 = loc("r0_numel"(#loc))
|
| 23 |
+
#loc99 = loc(callsite(#loc19 at #loc20))
|
| 24 |
+
#loc105 = loc("ileft"(#loc28))
|
| 25 |
+
#loc109 = loc("iright"(#loc33))
|
| 26 |
+
#loc118 = loc("left_idx"(#loc42))
|
| 27 |
+
#loc123 = loc("right_idx"(#loc47))
|
| 28 |
+
#loc143 = loc("tmp11"(#loc67))
|
| 29 |
+
#loc149 = loc(callsite(#loc24 at #loc99))
|
| 30 |
+
#loc153 = loc(callsite(#loc1 at #loc143))
|
| 31 |
+
#loc157 = loc(callsite(#loc105 at #loc149))
|
| 32 |
+
#loc161 = loc(callsite(#loc109 at #loc149))
|
| 33 |
+
#loc169 = loc(callsite(#loc118 at #loc149))
|
| 34 |
+
#loc174 = loc(callsite(#loc123 at #loc149))
|
| 35 |
+
#loc194 = loc(callsite(#loc1 at #loc157))
|
| 36 |
+
#loc196 = loc(callsite(#loc1 at #loc161))
|
| 37 |
+
#loc199 = loc(callsite(#loc1 at #loc169))
|
| 38 |
+
#loc202 = loc(callsite(#loc1 at #loc174))
|
| 39 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 40 |
+
tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 41 |
+
%cst = arith.constant dense<0> : tensor<8x16xi32, #linear> loc(#loc1)
|
| 42 |
+
%cst_0 = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1)
|
| 43 |
+
%c8_i32 = arith.constant 8 : i32 loc(#loc1)
|
| 44 |
+
%cst_1 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1)
|
| 45 |
+
%cst_2 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1)
|
| 46 |
+
%cst_3 = arith.constant dense<16> : tensor<8x1xi32, #blocked> loc(#loc1)
|
| 47 |
+
%cst_4 = arith.constant dense<16> : tensor<8x1xi32, #blocked1> loc(#loc1)
|
| 48 |
+
%cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1)
|
| 49 |
+
%cst_6 = arith.constant dense<272> : tensor<8x1xi32, #blocked> loc(#loc1)
|
| 50 |
+
%cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1)
|
| 51 |
+
%cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1)
|
| 52 |
+
%cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1)
|
| 53 |
+
%cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1)
|
| 54 |
+
%cst_11 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1)
|
| 55 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc82)
|
| 56 |
+
%xoffset_12 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc83)
|
| 57 |
+
%xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84)
|
| 58 |
+
%xindex_13 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84)
|
| 59 |
+
%xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc84)
|
| 60 |
+
%xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc84)
|
| 61 |
+
%xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<8x1xi32, #blocked> loc(#loc85)
|
| 62 |
+
%xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc85)
|
| 63 |
+
%xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<8x1xi32, #blocked> loc(#loc85)
|
| 64 |
+
%xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<8x1xi32, #blocked1> loc(#loc85)
|
| 65 |
+
%xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc86)
|
| 66 |
+
%xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<8x1xi32, #blocked1> loc(#loc86)
|
| 67 |
+
%r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87)
|
| 68 |
+
%r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87)
|
| 69 |
+
%r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87)
|
| 70 |
+
%r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87)
|
| 71 |
+
%r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87)
|
| 72 |
+
%r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87)
|
| 73 |
+
%x0 = arith.remsi %xindex_18, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc88)
|
| 74 |
+
%x1 = arith.divsi %xindex_18, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc89)
|
| 75 |
+
%tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90)
|
| 76 |
+
%tmp0_26 = tt.broadcast %x0 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc91)
|
| 77 |
+
%tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc91)
|
| 78 |
+
%tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x16xi32, #blocked> loc(#loc91)
|
| 79 |
+
%tmp0_29 = arith.muli %x1, %cst_6 : tensor<8x1xi32, #blocked> loc(#loc92)
|
| 80 |
+
%tmp0_30 = tt.broadcast %tmp0_29 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc93)
|
| 81 |
+
%tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<8x16xi32, #blocked> loc(#loc93)
|
| 82 |
+
%tmp0_32 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>, #blocked> loc(#loc94)
|
| 83 |
+
%tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<8x16x!tt.ptr<i32>, #blocked>, tensor<8x16xi32, #blocked> loc(#loc94)
|
| 84 |
+
%tmp0_34 = tt.broadcast %xmask : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc95)
|
| 85 |
+
%tmp0_35 = tt.broadcast %xmask_20 : tensor<8x1xi1, #blocked1> -> tensor<8x16xi1, #blocked1> loc(#loc95)
|
| 86 |
+
%tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<8x16x!tt.ptr<i32>, #blocked> loc(#loc95)
|
| 87 |
+
%tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96)
|
| 88 |
+
%tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<8x16xi16, #linear> loc(#loc97)
|
| 89 |
+
%flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146)
|
| 90 |
+
%flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146)
|
| 91 |
+
%flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146)
|
| 92 |
+
%flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146)
|
| 93 |
+
%flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146)
|
| 94 |
+
%flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146)
|
| 95 |
+
%flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146)
|
| 96 |
+
%flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146)
|
| 97 |
+
%flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146)
|
| 98 |
+
%flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146)
|
| 99 |
+
%flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146)
|
| 100 |
+
%flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146)
|
| 101 |
+
%flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc147)
|
| 102 |
+
%flip_49 = tt.reshape %flip_48 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #blocked> loc(#loc148)
|
| 103 |
+
%flip_50 = tt.reshape %flip_48 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc148)
|
| 104 |
+
%y = tt.reshape %tmp0_36 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
|
| 105 |
+
%left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155)
|
| 106 |
+
%left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155)
|
| 107 |
+
%left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155)
|
| 108 |
+
%left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155)
|
| 109 |
+
%ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc156)
|
| 110 |
+
%ileft_54 = arith.muli %y, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
|
| 111 |
+
%ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({
|
| 112 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 113 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 114 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 115 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 116 |
+
%ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
|
| 117 |
+
%ileft_57 = tt.broadcast %ileft_56 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
|
| 118 |
+
%iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc160)
|
| 119 |
+
%iright_58 = arith.muli %y, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
|
| 120 |
+
%iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({
|
| 121 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 122 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 123 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 124 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 125 |
+
%iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
|
| 126 |
+
%iright_61 = tt.broadcast %iright_60 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
|
| 127 |
+
%ileft_62 = tt.reshape %ileft_57 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc164)
|
| 128 |
+
%ileft_63 = tt.reshape %ileft_57 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 129 |
+
%iright_64 = tt.reshape %iright_61 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc165)
|
| 130 |
+
%iright_65 = tt.reshape %iright_61 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 131 |
+
%y_idx = tt.reshape %tmp4 : tensor<8x16xi16, #linear> -> tensor<64x2x1xi16, #linear1> loc(#loc166)
|
| 132 |
+
%left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167)
|
| 133 |
+
%left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc168)
|
| 134 |
+
%left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<64x2x1xi16, #linear1> loc(#loc168)
|
| 135 |
+
%input = arith.extsi %left_idx_67 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc197)
|
| 136 |
+
%left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 137 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 138 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 139 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 140 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 141 |
+
%left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
|
| 142 |
+
%left_idx_70 = tt.broadcast %left_idx_69 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
|
| 143 |
+
%right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172)
|
| 144 |
+
%right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc173)
|
| 145 |
+
%right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<64x2x1xi16, #linear1> loc(#loc173)
|
| 146 |
+
%input_73 = arith.extsi %right_idx_72 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc200)
|
| 147 |
+
%right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({
|
| 148 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 149 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 150 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 151 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 152 |
+
%right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
|
| 153 |
+
%right_idx_76 = tt.broadcast %right_idx_75 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
|
| 154 |
+
%left_idx_77 = tt.reshape %left_idx_70 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc177)
|
| 155 |
+
%left_idx_78 = tt.reshape %left_idx_70 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 156 |
+
%right_idx_79 = tt.reshape %right_idx_76 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc178)
|
| 157 |
+
%right_idx_80 = tt.reshape %right_idx_76 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 158 |
+
%cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc179)
|
| 159 |
+
%cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 160 |
+
%eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc180)
|
| 161 |
+
%eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 162 |
+
%cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<8x16xi32, #blocked> loc(#loc181)
|
| 163 |
+
%cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 164 |
+
%cond_85 = arith.andi %eq, %cond_83 : tensor<8x16xi1, #blocked> loc(#loc182)
|
| 165 |
+
%cond_86 = arith.andi %eq_82, %cond_84 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 166 |
+
%cond_87 = arith.ori %cond, %cond_85 : tensor<8x16xi1, #blocked> loc(#loc183)
|
| 167 |
+
%cond_88 = arith.ori %cond_81, %cond_86 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 168 |
+
%cond_89 = arith.extui %cond_87 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc184)
|
| 169 |
+
%cond_90 = arith.extui %cond_88 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
|
| 170 |
+
%cond_91 = arith.xori %cond_89, %flip_49 : tensor<8x16xi32, #blocked> loc(#loc184)
|
| 171 |
+
%cond_92 = arith.xori %cond_90, %flip_50 : tensor<8x16xi32, #linear> loc(#loc184)
|
| 172 |
+
%cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<8x16xi32, #blocked> loc(#loc185)
|
| 173 |
+
%cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<8x16xi32, #linear> loc(#loc185)
|
| 174 |
+
%ret = arith.xori %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc186)
|
| 175 |
+
%ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc187)
|
| 176 |
+
%ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<8x16xi32, #blocked> loc(#loc188)
|
| 177 |
+
%ret_97 = ttg.convert_layout %ret_96 : tensor<8x16xi32, #blocked> -> tensor<8x16xi32, #linear> loc(#loc188)
|
| 178 |
+
%new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 179 |
+
%new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 180 |
+
%new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191)
|
| 181 |
+
%new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<8x16xi32, #linear> loc(#loc191)
|
| 182 |
+
%new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 183 |
+
%flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc147)
|
| 184 |
+
%flip_103 = tt.reshape %flip_102 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc148)
|
| 185 |
+
%y_104 = tt.reshape %ret_96 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #linear2> loc(#loc154)
|
| 186 |
+
%ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc156)
|
| 187 |
+
%ileft_106 = arith.muli %y_104, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156)
|
| 188 |
+
%ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({
|
| 189 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 190 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 191 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 192 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
|
| 193 |
+
%ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158)
|
| 194 |
+
%ileft_109 = tt.broadcast %ileft_108 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159)
|
| 195 |
+
%iright_110 = arith.muli %y_104, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160)
|
| 196 |
+
%iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({
|
| 197 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 198 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 199 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 200 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
|
| 201 |
+
%iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162)
|
| 202 |
+
%iright_113 = tt.broadcast %iright_112 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163)
|
| 203 |
+
%ileft_114 = tt.reshape %ileft_109 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 204 |
+
%iright_115 = tt.reshape %iright_113 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 205 |
+
%y_idx_116 = tt.reshape %new_idxs_101 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166)
|
| 206 |
+
%left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168)
|
| 207 |
+
%left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({
|
| 208 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 209 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 210 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 211 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
|
| 212 |
+
%left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170)
|
| 213 |
+
%left_idx_120 = tt.broadcast %left_idx_119 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171)
|
| 214 |
+
%right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173)
|
| 215 |
+
%right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({
|
| 216 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 217 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 218 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 219 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
|
| 220 |
+
%right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175)
|
| 221 |
+
%right_idx_124 = tt.broadcast %right_idx_123 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176)
|
| 222 |
+
%left_idx_125 = tt.reshape %left_idx_120 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 223 |
+
%right_idx_126 = tt.reshape %right_idx_124 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 224 |
+
%cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 225 |
+
%eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 226 |
+
%cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 227 |
+
%cond_130 = arith.andi %eq_128, %cond_129 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 228 |
+
%cond_131 = arith.ori %cond_127, %cond_130 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 229 |
+
%cond_132 = arith.extui %cond_131 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
|
| 230 |
+
%cond_133 = arith.xori %cond_132, %flip_103 : tensor<8x16xi32, #linear> loc(#loc184)
|
| 231 |
+
%cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<8x16xi32, #linear> loc(#loc185)
|
| 232 |
+
%ret_135 = arith.xori %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 233 |
+
%ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 234 |
+
%ret_137 = arith.xori %ret_97, %ret_136 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 235 |
+
%new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 236 |
+
%new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 237 |
+
%new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 238 |
+
%y_141 = tt.reshape %ret_137 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
|
| 239 |
+
%ileft_142 = arith.muli %y_141, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
|
| 240 |
+
%ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({
|
| 241 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 242 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 243 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 244 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 245 |
+
%ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
|
| 246 |
+
%ileft_145 = tt.broadcast %ileft_144 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
|
| 247 |
+
%iright_146 = arith.muli %y_141, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
|
| 248 |
+
%iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({
|
| 249 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 250 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 251 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 252 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 253 |
+
%iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
|
| 254 |
+
%iright_149 = tt.broadcast %iright_148 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
|
| 255 |
+
%ileft_150 = tt.reshape %ileft_145 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 256 |
+
%iright_151 = tt.reshape %iright_149 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 257 |
+
%y_idx_152 = tt.reshape %new_idxs_140 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166)
|
| 258 |
+
%left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168)
|
| 259 |
+
%left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({
|
| 260 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 261 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 262 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 263 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 264 |
+
%left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
|
| 265 |
+
%left_idx_156 = tt.broadcast %left_idx_155 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
|
| 266 |
+
%right_idx_157 = arith.muli %y_idx_152, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173)
|
| 267 |
+
%right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({
|
| 268 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 269 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 270 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 271 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 272 |
+
%right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
|
| 273 |
+
%right_idx_160 = tt.broadcast %right_idx_159 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
|
| 274 |
+
%left_idx_161 = tt.reshape %left_idx_156 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 275 |
+
%right_idx_162 = tt.reshape %right_idx_160 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 276 |
+
%cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 277 |
+
%eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 278 |
+
%cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 279 |
+
%cond_166 = arith.andi %eq_164, %cond_165 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 280 |
+
%cond_167 = arith.ori %cond_163, %cond_166 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 281 |
+
%cond_168 = arith.extui %cond_167 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
|
| 282 |
+
%cond_169 = arith.xori %cond_168, %flip_103 : tensor<8x16xi32, #linear> loc(#loc184)
|
| 283 |
+
%cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<8x16xi32, #linear> loc(#loc185)
|
| 284 |
+
%ret_171 = arith.xori %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 285 |
+
%ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 286 |
+
%ret_173 = arith.xori %ret_137, %ret_172 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 287 |
+
%new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 288 |
+
%new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 289 |
+
%new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 290 |
+
%flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc147)
|
| 291 |
+
%flip_178 = tt.reshape %flip_177 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc148)
|
| 292 |
+
%y_179 = tt.reshape %ret_173 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc154)
|
| 293 |
+
%ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc156)
|
| 294 |
+
%ileft_181 = arith.muli %y_179, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc156)
|
| 295 |
+
%ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({
|
| 296 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 297 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 298 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 299 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
|
| 300 |
+
%ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc158)
|
| 301 |
+
%ileft_184 = tt.broadcast %ileft_183 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc159)
|
| 302 |
+
%iright_185 = arith.muli %y_179, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc160)
|
| 303 |
+
%iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({
|
| 304 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 305 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 306 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 307 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
|
| 308 |
+
%iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc162)
|
| 309 |
+
%iright_188 = tt.broadcast %iright_187 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc163)
|
| 310 |
+
%ileft_189 = tt.reshape %ileft_184 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 311 |
+
%iright_190 = tt.reshape %iright_188 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 312 |
+
%y_idx_191 = tt.reshape %new_idxs_176 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc166)
|
| 313 |
+
%left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc168)
|
| 314 |
+
%left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({
|
| 315 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 316 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 317 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 318 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
|
| 319 |
+
%left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc170)
|
| 320 |
+
%left_idx_195 = tt.broadcast %left_idx_194 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc171)
|
| 321 |
+
%right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc173)
|
| 322 |
+
%right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({
|
| 323 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 324 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 325 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 326 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
|
| 327 |
+
%right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc175)
|
| 328 |
+
%right_idx_199 = tt.broadcast %right_idx_198 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc176)
|
| 329 |
+
%left_idx_200 = tt.reshape %left_idx_195 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 330 |
+
%right_idx_201 = tt.reshape %right_idx_199 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 331 |
+
%cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 332 |
+
%eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 333 |
+
%cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 334 |
+
%cond_205 = arith.andi %eq_203, %cond_204 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 335 |
+
%cond_206 = arith.ori %cond_202, %cond_205 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 336 |
+
%cond_207 = arith.extui %cond_206 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
|
| 337 |
+
%cond_208 = arith.xori %cond_207, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184)
|
| 338 |
+
%cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<8x16xi32, #linear> loc(#loc185)
|
| 339 |
+
%ret_210 = arith.xori %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 340 |
+
%ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 341 |
+
%ret_212 = arith.xori %ret_173, %ret_211 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 342 |
+
%new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 343 |
+
%new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 344 |
+
%new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 345 |
+
%y_216 = tt.reshape %ret_212 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc154)
|
| 346 |
+
%ileft_217 = arith.muli %y_216, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156)
|
| 347 |
+
%ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({
|
| 348 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 349 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 350 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 351 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
|
| 352 |
+
%ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158)
|
| 353 |
+
%ileft_220 = tt.broadcast %ileft_219 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159)
|
| 354 |
+
%iright_221 = arith.muli %y_216, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160)
|
| 355 |
+
%iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({
|
| 356 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 357 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 358 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 359 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
|
| 360 |
+
%iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162)
|
| 361 |
+
%iright_224 = tt.broadcast %iright_223 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163)
|
| 362 |
+
%ileft_225 = tt.reshape %ileft_220 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 363 |
+
%iright_226 = tt.reshape %iright_224 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 364 |
+
%y_idx_227 = tt.reshape %new_idxs_215 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166)
|
| 365 |
+
%left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168)
|
| 366 |
+
%left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({
|
| 367 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 368 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 369 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 370 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
|
| 371 |
+
%left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170)
|
| 372 |
+
%left_idx_231 = tt.broadcast %left_idx_230 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171)
|
| 373 |
+
%right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173)
|
| 374 |
+
%right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({
|
| 375 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 376 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 377 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 378 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
|
| 379 |
+
%right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175)
|
| 380 |
+
%right_idx_235 = tt.broadcast %right_idx_234 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176)
|
| 381 |
+
%left_idx_236 = tt.reshape %left_idx_231 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 382 |
+
%right_idx_237 = tt.reshape %right_idx_235 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 383 |
+
%cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 384 |
+
%eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 385 |
+
%cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 386 |
+
%cond_241 = arith.andi %eq_239, %cond_240 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 387 |
+
%cond_242 = arith.ori %cond_238, %cond_241 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 388 |
+
%cond_243 = arith.extui %cond_242 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
|
| 389 |
+
%cond_244 = arith.xori %cond_243, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184)
|
| 390 |
+
%cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<8x16xi32, #linear> loc(#loc185)
|
| 391 |
+
%ret_246 = arith.xori %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 392 |
+
%ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 393 |
+
%ret_248 = arith.xori %ret_212, %ret_247 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 394 |
+
%new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 395 |
+
%new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 396 |
+
%new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 397 |
+
%y_252 = tt.reshape %ret_248 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
|
| 398 |
+
%ileft_253 = arith.muli %y_252, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
|
| 399 |
+
%ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({
|
| 400 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 401 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 402 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 403 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 404 |
+
%ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
|
| 405 |
+
%ileft_256 = tt.broadcast %ileft_255 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
|
| 406 |
+
%iright_257 = arith.muli %y_252, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
|
| 407 |
+
%iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({
|
| 408 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 409 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 410 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 411 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 412 |
+
%iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
|
| 413 |
+
%iright_260 = tt.broadcast %iright_259 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
|
| 414 |
+
%ileft_261 = tt.reshape %ileft_256 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 415 |
+
%iright_262 = tt.reshape %iright_260 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 416 |
+
%y_idx_263 = tt.reshape %new_idxs_251 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166)
|
| 417 |
+
%left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168)
|
| 418 |
+
%left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({
|
| 419 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 420 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 421 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 422 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 423 |
+
%left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
|
| 424 |
+
%left_idx_267 = tt.broadcast %left_idx_266 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
|
| 425 |
+
%right_idx_268 = arith.muli %y_idx_263, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173)
|
| 426 |
+
%right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({
|
| 427 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 428 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 429 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 430 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 431 |
+
%right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
|
| 432 |
+
%right_idx_271 = tt.broadcast %right_idx_270 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
|
| 433 |
+
%left_idx_272 = tt.reshape %left_idx_267 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 434 |
+
%right_idx_273 = tt.reshape %right_idx_271 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 435 |
+
%cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 436 |
+
%eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 437 |
+
%cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 438 |
+
%cond_277 = arith.andi %eq_275, %cond_276 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 439 |
+
%cond_278 = arith.ori %cond_274, %cond_277 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 440 |
+
%cond_279 = arith.extui %cond_278 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184)
|
| 441 |
+
%cond_280 = arith.xori %cond_279, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184)
|
| 442 |
+
%cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<8x16xi32, #linear> loc(#loc185)
|
| 443 |
+
%ret_282 = arith.xori %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 444 |
+
%ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 445 |
+
%ret_284 = arith.xori %ret_248, %ret_283 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 446 |
+
%new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 447 |
+
%new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 448 |
+
%new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 449 |
+
%y_288 = tt.reshape %ret_284 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc154)
|
| 450 |
+
%ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc156)
|
| 451 |
+
%ileft_290 = arith.muli %y_288, %ileft_289 : tensor<8x2x8xi32, #linear4> loc(#loc156)
|
| 452 |
+
%ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({
|
| 453 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 454 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 455 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 456 |
+
}) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193)
|
| 457 |
+
%ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc158)
|
| 458 |
+
%ileft_293 = tt.broadcast %ileft_292 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc159)
|
| 459 |
+
%iright_294 = arith.muli %y_288, %flip_177 : tensor<8x2x8xi32, #linear4> loc(#loc160)
|
| 460 |
+
%iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({
|
| 461 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 462 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 463 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 464 |
+
}) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195)
|
| 465 |
+
%iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc162)
|
| 466 |
+
%iright_297 = tt.broadcast %iright_296 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc163)
|
| 467 |
+
%ileft_298 = tt.reshape %ileft_293 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 468 |
+
%iright_299 = tt.reshape %iright_297 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 469 |
+
%y_idx_300 = tt.reshape %new_idxs_287 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc166)
|
| 470 |
+
%left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<8x2x8xi32, #linear4> loc(#loc168)
|
| 471 |
+
%left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({
|
| 472 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 473 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 474 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 475 |
+
}) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198)
|
| 476 |
+
%left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc170)
|
| 477 |
+
%left_idx_304 = tt.broadcast %left_idx_303 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc171)
|
| 478 |
+
%right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<8x2x8xi32, #linear4> loc(#loc173)
|
| 479 |
+
%right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({
|
| 480 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 481 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 482 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 483 |
+
}) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201)
|
| 484 |
+
%right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc175)
|
| 485 |
+
%right_idx_308 = tt.broadcast %right_idx_307 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc176)
|
| 486 |
+
%left_idx_309 = tt.reshape %left_idx_304 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 487 |
+
%right_idx_310 = tt.reshape %right_idx_308 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 488 |
+
%cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 489 |
+
%eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 490 |
+
%cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 491 |
+
%cond_314 = arith.andi %eq_312, %cond_313 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 492 |
+
%cond_315 = arith.ori %cond_311, %cond_314 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 493 |
+
%ret_316 = arith.xori %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 494 |
+
%ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 495 |
+
%ret_318 = arith.xori %ret_284, %ret_317 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 496 |
+
%new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 497 |
+
%new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 498 |
+
%new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 499 |
+
%y_322 = tt.reshape %ret_318 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc154)
|
| 500 |
+
%ileft_323 = arith.muli %y_322, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc156)
|
| 501 |
+
%ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({
|
| 502 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 503 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 504 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 505 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
|
| 506 |
+
%ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc158)
|
| 507 |
+
%ileft_326 = tt.broadcast %ileft_325 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc159)
|
| 508 |
+
%iright_327 = arith.muli %y_322, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc160)
|
| 509 |
+
%iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({
|
| 510 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 511 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 512 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 513 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
|
| 514 |
+
%iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc162)
|
| 515 |
+
%iright_330 = tt.broadcast %iright_329 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc163)
|
| 516 |
+
%ileft_331 = tt.reshape %ileft_326 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 517 |
+
%iright_332 = tt.reshape %iright_330 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 518 |
+
%y_idx_333 = tt.reshape %new_idxs_321 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc166)
|
| 519 |
+
%left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc168)
|
| 520 |
+
%left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({
|
| 521 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 522 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 523 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 524 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
|
| 525 |
+
%left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc170)
|
| 526 |
+
%left_idx_337 = tt.broadcast %left_idx_336 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc171)
|
| 527 |
+
%right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc173)
|
| 528 |
+
%right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({
|
| 529 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 530 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 531 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 532 |
+
}) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
|
| 533 |
+
%right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc175)
|
| 534 |
+
%right_idx_341 = tt.broadcast %right_idx_340 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc176)
|
| 535 |
+
%left_idx_342 = tt.reshape %left_idx_337 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 536 |
+
%right_idx_343 = tt.reshape %right_idx_341 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 537 |
+
%cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 538 |
+
%eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 539 |
+
%cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 540 |
+
%cond_347 = arith.andi %eq_345, %cond_346 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 541 |
+
%cond_348 = arith.ori %cond_344, %cond_347 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 542 |
+
%ret_349 = arith.xori %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 543 |
+
%ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 544 |
+
%ret_351 = arith.xori %ret_318, %ret_350 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 545 |
+
%new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 546 |
+
%new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 547 |
+
%new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 548 |
+
%y_355 = tt.reshape %ret_351 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc154)
|
| 549 |
+
%ileft_356 = arith.muli %y_355, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156)
|
| 550 |
+
%ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({
|
| 551 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 552 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 553 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 554 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
|
| 555 |
+
%ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158)
|
| 556 |
+
%ileft_359 = tt.broadcast %ileft_358 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159)
|
| 557 |
+
%iright_360 = arith.muli %y_355, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160)
|
| 558 |
+
%iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({
|
| 559 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 560 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 561 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 562 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
|
| 563 |
+
%iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162)
|
| 564 |
+
%iright_363 = tt.broadcast %iright_362 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163)
|
| 565 |
+
%ileft_364 = tt.reshape %ileft_359 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 566 |
+
%iright_365 = tt.reshape %iright_363 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 567 |
+
%y_idx_366 = tt.reshape %new_idxs_354 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166)
|
| 568 |
+
%left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168)
|
| 569 |
+
%left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({
|
| 570 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 571 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 572 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 573 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
|
| 574 |
+
%left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170)
|
| 575 |
+
%left_idx_370 = tt.broadcast %left_idx_369 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171)
|
| 576 |
+
%right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173)
|
| 577 |
+
%right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({
|
| 578 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 579 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 580 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 581 |
+
}) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
|
| 582 |
+
%right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175)
|
| 583 |
+
%right_idx_374 = tt.broadcast %right_idx_373 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176)
|
| 584 |
+
%left_idx_375 = tt.reshape %left_idx_370 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 585 |
+
%right_idx_376 = tt.reshape %right_idx_374 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 586 |
+
%cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 587 |
+
%eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 588 |
+
%cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 589 |
+
%cond_380 = arith.andi %eq_378, %cond_379 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 590 |
+
%cond_381 = arith.ori %cond_377, %cond_380 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 591 |
+
%ret_382 = arith.xori %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc186)
|
| 592 |
+
%ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187)
|
| 593 |
+
%ret_384 = arith.xori %ret_351, %ret_383 : tensor<8x16xi32, #linear> loc(#loc188)
|
| 594 |
+
%new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 595 |
+
%new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 596 |
+
%new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 597 |
+
%y_388 = tt.reshape %ret_384 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154)
|
| 598 |
+
%ileft_389 = arith.muli %y_388, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156)
|
| 599 |
+
%ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({
|
| 600 |
+
^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
|
| 601 |
+
%ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
|
| 602 |
+
tt.reduce.return %ileft_421 : i32 loc(#loc193)
|
| 603 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
|
| 604 |
+
%ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158)
|
| 605 |
+
%ileft_392 = tt.broadcast %ileft_391 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159)
|
| 606 |
+
%iright_393 = arith.muli %y_388, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160)
|
| 607 |
+
%iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({
|
| 608 |
+
^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
|
| 609 |
+
%iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
|
| 610 |
+
tt.reduce.return %iright_421 : i32 loc(#loc195)
|
| 611 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
|
| 612 |
+
%iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162)
|
| 613 |
+
%iright_396 = tt.broadcast %iright_395 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163)
|
| 614 |
+
%ileft_397 = tt.reshape %ileft_392 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164)
|
| 615 |
+
%iright_398 = tt.reshape %iright_396 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165)
|
| 616 |
+
%y_idx_399 = tt.reshape %new_idxs_387 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166)
|
| 617 |
+
%left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168)
|
| 618 |
+
%left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({
|
| 619 |
+
^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
|
| 620 |
+
%left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
|
| 621 |
+
tt.reduce.return %left_idx_421 : i32 loc(#loc198)
|
| 622 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
|
| 623 |
+
%left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170)
|
| 624 |
+
%left_idx_403 = tt.broadcast %left_idx_402 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171)
|
| 625 |
+
%right_idx_404 = arith.muli %y_idx_399, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173)
|
| 626 |
+
%right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({
|
| 627 |
+
^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
|
| 628 |
+
%right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
|
| 629 |
+
tt.reduce.return %right_idx_421 : i32 loc(#loc201)
|
| 630 |
+
}) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
|
| 631 |
+
%right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175)
|
| 632 |
+
%right_idx_407 = tt.broadcast %right_idx_406 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176)
|
| 633 |
+
%left_idx_408 = tt.reshape %left_idx_403 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177)
|
| 634 |
+
%right_idx_409 = tt.reshape %right_idx_407 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178)
|
| 635 |
+
%cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<8x16xi32, #linear> loc(#loc179)
|
| 636 |
+
%eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<8x16xi32, #linear> loc(#loc180)
|
| 637 |
+
%cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<8x16xi32, #linear> loc(#loc181)
|
| 638 |
+
%cond_413 = arith.andi %eq_411, %cond_412 : tensor<8x16xi1, #linear> loc(#loc182)
|
| 639 |
+
%cond_414 = arith.ori %cond_410, %cond_413 : tensor<8x16xi1, #linear> loc(#loc183)
|
| 640 |
+
%new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<8x16xi32, #linear> loc(#loc189)
|
| 641 |
+
%new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190)
|
| 642 |
+
%new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<8x16xi32, #linear> loc(#loc191)
|
| 643 |
+
%tmp7 = arith.extsi %tmp0_36 : tensor<8x16xi32, #blocked> to tensor<8x16xi64, #blocked> loc(#loc141)
|
| 644 |
+
%tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc142)
|
| 645 |
+
%tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({
|
| 646 |
+
^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))):
|
| 647 |
+
%tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192)
|
| 648 |
+
tt.reduce.return %tmp11_421 : i64 loc(#loc152)
|
| 649 |
+
}) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152)
|
| 650 |
+
%tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc144)
|
| 651 |
+
%tmp14 = arith.trunci %tmp11_418 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc145)
|
| 652 |
+
%0 = arith.muli %xindex_19, %cst_4 : tensor<8x1xi32, #blocked1> loc(#loc70)
|
| 653 |
+
%1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<8x16xi32, #blocked1> loc(#loc71)
|
| 654 |
+
%2 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x16xi32, #blocked1> loc(#loc71)
|
| 655 |
+
%3 = arith.addi %1, %2 : tensor<8x16xi32, #blocked1> loc(#loc71)
|
| 656 |
+
%4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>, #blocked1> loc(#loc72)
|
| 657 |
+
%5 = tt.addptr %4, %3 : tensor<8x16x!tt.ptr<i32>, #blocked1>, tensor<8x16xi32, #blocked1> loc(#loc72)
|
| 658 |
+
%6 = ttg.convert_layout %new_idxs_417 : tensor<8x16xi32, #linear> -> tensor<8x16xi32, #blocked1> loc(#loc73)
|
| 659 |
+
tt.store %5, %6, %tmp0_35 : tensor<8x16x!tt.ptr<i32>, #blocked1> loc(#loc73)
|
| 660 |
+
%7 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<8x1x!tt.ptr<i32>, #blocked> loc(#loc74)
|
| 661 |
+
%8 = tt.addptr %7, %xindex_18 : tensor<8x1x!tt.ptr<i32>, #blocked>, tensor<8x1xi32, #blocked> loc(#loc74)
|
| 662 |
+
tt.store %8, %tmp14, %xmask : tensor<8x1x!tt.ptr<i32>, #blocked> loc(#loc75)
|
| 663 |
+
tt.return loc(#loc76)
|
| 664 |
+
} loc(#loc)
|
| 665 |
+
} loc(#loc)
|
| 666 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28)
|
| 667 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33)
|
| 668 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44)
|
| 669 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23)
|
| 670 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21)
|
| 671 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38)
|
| 672 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19)
|
| 673 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19)
|
| 674 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38)
|
| 675 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35)
|
| 676 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49)
|
| 677 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45)
|
| 678 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30)
|
| 679 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54)
|
| 680 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19)
|
| 681 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":40:33)
|
| 682 |
+
#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
|
| 683 |
+
#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
|
| 684 |
+
#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
|
| 685 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
|
| 686 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
|
| 687 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
|
| 688 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 689 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 690 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
|
| 691 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
|
| 692 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
|
| 693 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
|
| 694 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
|
| 695 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
|
| 696 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
|
| 697 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
|
| 698 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
|
| 699 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
|
| 700 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
|
| 701 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
|
| 702 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
|
| 703 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
|
| 704 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
|
| 705 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
|
| 706 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
|
| 707 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
|
| 708 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
|
| 709 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
|
| 710 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
|
| 711 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
|
| 712 |
+
#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
|
| 713 |
+
#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
|
| 714 |
+
#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
|
| 715 |
+
#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
|
| 716 |
+
#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
|
| 717 |
+
#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
|
| 718 |
+
#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
|
| 719 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
|
| 720 |
+
#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
|
| 721 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
|
| 722 |
+
#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19)
|
| 723 |
+
#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34)
|
| 724 |
+
#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29)
|
| 725 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21)
|
| 726 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35)
|
| 727 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32)
|
| 728 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25)
|
| 729 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47)
|
| 730 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25)
|
| 731 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37)
|
| 732 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4)
|
| 733 |
+
#loc82 = loc("xoffset"(#loc2))
|
| 734 |
+
#loc83 = loc("xoffset"(#loc3))
|
| 735 |
+
#loc84 = loc("xindex"(#loc4))
|
| 736 |
+
#loc85 = loc("xindex"(#loc5))
|
| 737 |
+
#loc86 = loc("xmask"(#loc6))
|
| 738 |
+
#loc87 = loc("r0_index"(#loc7))
|
| 739 |
+
#loc88 = loc("x0"(#loc8))
|
| 740 |
+
#loc89 = loc("x1"(#loc9))
|
| 741 |
+
#loc90 = loc("tmp0"(#loc10))
|
| 742 |
+
#loc91 = loc("tmp0"(#loc11))
|
| 743 |
+
#loc92 = loc("tmp0"(#loc12))
|
| 744 |
+
#loc93 = loc("tmp0"(#loc13))
|
| 745 |
+
#loc94 = loc("tmp0"(#loc14))
|
| 746 |
+
#loc95 = loc("tmp0"(#loc15))
|
| 747 |
+
#loc96 = loc("tmp2"(#loc16))
|
| 748 |
+
#loc97 = loc("tmp4"(#loc17))
|
| 749 |
+
#loc98 = loc("flip"(#loc18))
|
| 750 |
+
#loc100 = loc("flip"(#loc21))
|
| 751 |
+
#loc101 = loc("flip"(#loc22))
|
| 752 |
+
#loc102 = loc("y"(#loc23))
|
| 753 |
+
#loc103 = loc("left_mask"(#loc25))
|
| 754 |
+
#loc104 = loc("ileft"(#loc26))
|
| 755 |
+
#loc106 = loc("ileft"(#loc30))
|
| 756 |
+
#loc107 = loc("ileft"(#loc31))
|
| 757 |
+
#loc108 = loc("iright"(#loc32))
|
| 758 |
+
#loc110 = loc("iright"(#loc34))
|
| 759 |
+
#loc111 = loc("iright"(#loc35))
|
| 760 |
+
#loc112 = loc("ileft"(#loc36))
|
| 761 |
+
#loc113 = loc("iright"(#loc37))
|
| 762 |
+
#loc114 = loc("y_idx"(#loc38))
|
| 763 |
+
#loc115 = loc("left_idx"(#loc39))
|
| 764 |
+
#loc116 = loc("left_idx"(#loc40))
|
| 765 |
+
#loc117 = loc("input"(#loc41))
|
| 766 |
+
#loc119 = loc("left_idx"(#loc43))
|
| 767 |
+
#loc120 = loc("left_idx"(#loc44))
|
| 768 |
+
#loc121 = loc("right_idx"(#loc45))
|
| 769 |
+
#loc122 = loc("right_idx"(#loc46))
|
| 770 |
+
#loc124 = loc("right_idx"(#loc48))
|
| 771 |
+
#loc125 = loc("right_idx"(#loc49))
|
| 772 |
+
#loc126 = loc("left_idx"(#loc50))
|
| 773 |
+
#loc127 = loc("right_idx"(#loc51))
|
| 774 |
+
#loc128 = loc("cond"(#loc52))
|
| 775 |
+
#loc129 = loc("eq"(#loc53))
|
| 776 |
+
#loc130 = loc("cond"(#loc54))
|
| 777 |
+
#loc131 = loc("cond"(#loc55))
|
| 778 |
+
#loc132 = loc("cond"(#loc56))
|
| 779 |
+
#loc133 = loc("cond"(#loc57))
|
| 780 |
+
#loc134 = loc("cond"(#loc58))
|
| 781 |
+
#loc135 = loc("ret"(#loc59))
|
| 782 |
+
#loc136 = loc("ret"(#loc60))
|
| 783 |
+
#loc137 = loc("ret"(#loc61))
|
| 784 |
+
#loc138 = loc("new_idxs"(#loc62))
|
| 785 |
+
#loc139 = loc("new_idxs"(#loc63))
|
| 786 |
+
#loc140 = loc("new_idxs"(#loc64))
|
| 787 |
+
#loc141 = loc("tmp7"(#loc65))
|
| 788 |
+
#loc142 = loc("tmp10"(#loc66))
|
| 789 |
+
#loc144 = loc("tmp11"(#loc68))
|
| 790 |
+
#loc145 = loc("tmp14"(#loc69))
|
| 791 |
+
#loc146 = loc(callsite(#loc98 at #loc99))
|
| 792 |
+
#loc147 = loc(callsite(#loc100 at #loc99))
|
| 793 |
+
#loc148 = loc(callsite(#loc101 at #loc99))
|
| 794 |
+
#loc150 = loc("cond"(#loc128))
|
| 795 |
+
#loc151 = loc("eq"(#loc129))
|
| 796 |
+
#loc152 = loc(callsite(#loc27 at #loc143))
|
| 797 |
+
#loc154 = loc(callsite(#loc102 at #loc149))
|
| 798 |
+
#loc155 = loc(callsite(#loc103 at #loc149))
|
| 799 |
+
#loc156 = loc(callsite(#loc104 at #loc149))
|
| 800 |
+
#loc158 = loc(callsite(#loc106 at #loc149))
|
| 801 |
+
#loc159 = loc(callsite(#loc107 at #loc149))
|
| 802 |
+
#loc160 = loc(callsite(#loc108 at #loc149))
|
| 803 |
+
#loc162 = loc(callsite(#loc110 at #loc149))
|
| 804 |
+
#loc163 = loc(callsite(#loc111 at #loc149))
|
| 805 |
+
#loc164 = loc(callsite(#loc112 at #loc149))
|
| 806 |
+
#loc165 = loc(callsite(#loc113 at #loc149))
|
| 807 |
+
#loc166 = loc(callsite(#loc114 at #loc149))
|
| 808 |
+
#loc167 = loc(callsite(#loc115 at #loc149))
|
| 809 |
+
#loc168 = loc(callsite(#loc116 at #loc149))
|
| 810 |
+
#loc170 = loc(callsite(#loc119 at #loc149))
|
| 811 |
+
#loc171 = loc(callsite(#loc120 at #loc149))
|
| 812 |
+
#loc172 = loc(callsite(#loc121 at #loc149))
|
| 813 |
+
#loc173 = loc(callsite(#loc122 at #loc149))
|
| 814 |
+
#loc175 = loc(callsite(#loc124 at #loc149))
|
| 815 |
+
#loc176 = loc(callsite(#loc125 at #loc149))
|
| 816 |
+
#loc177 = loc(callsite(#loc126 at #loc149))
|
| 817 |
+
#loc178 = loc(callsite(#loc127 at #loc149))
|
| 818 |
+
#loc179 = loc(callsite(#loc150 at #loc149))
|
| 819 |
+
#loc180 = loc(callsite(#loc151 at #loc149))
|
| 820 |
+
#loc181 = loc(callsite(#loc130 at #loc149))
|
| 821 |
+
#loc182 = loc(callsite(#loc131 at #loc149))
|
| 822 |
+
#loc183 = loc(callsite(#loc132 at #loc149))
|
| 823 |
+
#loc184 = loc(callsite(#loc133 at #loc149))
|
| 824 |
+
#loc185 = loc(callsite(#loc134 at #loc149))
|
| 825 |
+
#loc186 = loc(callsite(#loc135 at #loc149))
|
| 826 |
+
#loc187 = loc(callsite(#loc136 at #loc149))
|
| 827 |
+
#loc188 = loc(callsite(#loc137 at #loc149))
|
| 828 |
+
#loc189 = loc(callsite(#loc138 at #loc149))
|
| 829 |
+
#loc190 = loc(callsite(#loc139 at #loc149))
|
| 830 |
+
#loc191 = loc(callsite(#loc140 at #loc149))
|
| 831 |
+
#loc192 = loc(callsite(#loc29 at #loc152))
|
| 832 |
+
#loc193 = loc(callsite(#loc27 at #loc157))
|
| 833 |
+
#loc195 = loc(callsite(#loc27 at #loc161))
|
| 834 |
+
#loc197 = loc(callsite(#loc117 at #loc169))
|
| 835 |
+
#loc198 = loc(callsite(#loc27 at #loc169))
|
| 836 |
+
#loc200 = loc(callsite(#loc117 at #loc174))
|
| 837 |
+
#loc201 = loc(callsite(#loc27 at #loc174))
|
| 838 |
+
#loc203 = loc(callsite(#loc29 at #loc193))
|
| 839 |
+
#loc204 = loc(callsite(#loc29 at #loc195))
|
| 840 |
+
#loc205 = loc(callsite(#loc29 at #loc198))
|
| 841 |
+
#loc206 = loc(callsite(#loc29 at #loc201))
|
SpecForge-ext/cache/compiled_kernels/triton/3/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir
ADDED
|
@@ -0,0 +1,799 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67)
|
| 4 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
|
| 5 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
|
| 6 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
|
| 7 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
|
| 8 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
|
| 9 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
|
| 10 |
+
#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26)
|
| 11 |
+
#loc80 = loc("in_ptr0"(#loc))
|
| 12 |
+
#loc81 = loc("out_ptr2"(#loc))
|
| 13 |
+
#loc82 = loc("out_ptr3"(#loc))
|
| 14 |
+
#loc83 = loc("xnumel"(#loc))
|
| 15 |
+
#loc84 = loc("r0_numel"(#loc))
|
| 16 |
+
#loc106 = loc(callsite(#loc23 at #loc2))
|
| 17 |
+
#loc113 = loc("ileft"(#loc32))
|
| 18 |
+
#loc117 = loc("iright"(#loc37))
|
| 19 |
+
#loc126 = loc("left_idx"(#loc46))
|
| 20 |
+
#loc131 = loc("right_idx"(#loc51))
|
| 21 |
+
#loc150 = loc("tmp11"(#loc70))
|
| 22 |
+
#loc157 = loc(callsite(#loc28 at #loc106))
|
| 23 |
+
#loc161 = loc(callsite(#loc1 at #loc150))
|
| 24 |
+
#loc165 = loc(callsite(#loc113 at #loc157))
|
| 25 |
+
#loc169 = loc(callsite(#loc117 at #loc157))
|
| 26 |
+
#loc177 = loc(callsite(#loc126 at #loc157))
|
| 27 |
+
#loc182 = loc(callsite(#loc131 at #loc157))
|
| 28 |
+
#loc202 = loc(callsite(#loc1 at #loc165))
|
| 29 |
+
#loc204 = loc(callsite(#loc1 at #loc169))
|
| 30 |
+
#loc207 = loc(callsite(#loc1 at #loc177))
|
| 31 |
+
#loc210 = loc(callsite(#loc1 at #loc182))
|
| 32 |
+
module {
|
| 33 |
+
tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 34 |
+
%cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85)
|
| 35 |
+
%cst_0 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1)
|
| 36 |
+
%tmp10 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc86)
|
| 37 |
+
%tmp0 = arith.constant dense<272> : tensor<8x1xi32> loc(#loc87)
|
| 38 |
+
%tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88)
|
| 39 |
+
%cst_2 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc1)
|
| 40 |
+
%xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc89)
|
| 41 |
+
%c8_i32 = arith.constant 8 : i32 loc(#loc1)
|
| 42 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc90)
|
| 43 |
+
%xoffset_3 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc91)
|
| 44 |
+
%xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc92)
|
| 45 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc93)
|
| 46 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<8x1xi32> loc(#loc94)
|
| 47 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<8x1xi32> loc(#loc94)
|
| 48 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<8x1xi32> loc(#loc89)
|
| 49 |
+
%r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95)
|
| 50 |
+
%r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96)
|
| 51 |
+
%x0 = arith.remsi %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc97)
|
| 52 |
+
%x1 = arith.divsi %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc98)
|
| 53 |
+
%tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88)
|
| 54 |
+
%tmp0_10 = tt.broadcast %x0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc99)
|
| 55 |
+
%tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc99)
|
| 56 |
+
%tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x16xi32> loc(#loc99)
|
| 57 |
+
%tmp0_13 = arith.muli %x1, %tmp0 : tensor<8x1xi32> loc(#loc87)
|
| 58 |
+
%tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc100)
|
| 59 |
+
%tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x16xi32> loc(#loc100)
|
| 60 |
+
%tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>> loc(#loc101)
|
| 61 |
+
%tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<8x16x!tt.ptr<i32>>, tensor<8x16xi32> loc(#loc101)
|
| 62 |
+
%tmp0_18 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc102)
|
| 63 |
+
%tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<8x16x!tt.ptr<i32>> loc(#loc102)
|
| 64 |
+
%tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103)
|
| 65 |
+
%tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc104)
|
| 66 |
+
%flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153)
|
| 67 |
+
%flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154)
|
| 68 |
+
%flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154)
|
| 69 |
+
%flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc155)
|
| 70 |
+
%flip_23 = tt.reshape %flip_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc156)
|
| 71 |
+
%y = tt.reshape %tmp0_19 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
|
| 72 |
+
%left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163)
|
| 73 |
+
%ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc164)
|
| 74 |
+
%ileft_24 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc164)
|
| 75 |
+
%ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({
|
| 76 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 77 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 78 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 79 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
|
| 80 |
+
%ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
|
| 81 |
+
%ileft_27 = tt.broadcast %ileft_26 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
|
| 82 |
+
%iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc168)
|
| 83 |
+
%iright_28 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc168)
|
| 84 |
+
%iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({
|
| 85 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 86 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 87 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 88 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
|
| 89 |
+
%iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
|
| 90 |
+
%iright_31 = tt.broadcast %iright_30 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
|
| 91 |
+
%ileft_32 = tt.reshape %ileft_27 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 92 |
+
%iright_33 = tt.reshape %iright_31 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 93 |
+
%y_idx = tt.reshape %tmp4 : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc174)
|
| 94 |
+
%left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175)
|
| 95 |
+
%left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc176)
|
| 96 |
+
%left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<64x2x1xi16> loc(#loc176)
|
| 97 |
+
%input = arith.extsi %left_idx_35 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc205)
|
| 98 |
+
%left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 99 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 100 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 101 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 102 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
|
| 103 |
+
%left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
|
| 104 |
+
%left_idx_38 = tt.broadcast %left_idx_37 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
|
| 105 |
+
%right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180)
|
| 106 |
+
%right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc181)
|
| 107 |
+
%right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<64x2x1xi16> loc(#loc181)
|
| 108 |
+
%input_41 = arith.extsi %right_idx_40 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc208)
|
| 109 |
+
%right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({
|
| 110 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 111 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 112 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 113 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
|
| 114 |
+
%right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
|
| 115 |
+
%right_idx_44 = tt.broadcast %right_idx_43 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
|
| 116 |
+
%left_idx_45 = tt.reshape %left_idx_38 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 117 |
+
%right_idx_46 = tt.reshape %right_idx_44 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 118 |
+
%cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc187)
|
| 119 |
+
%eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc188)
|
| 120 |
+
%cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<8x16xi32> loc(#loc189)
|
| 121 |
+
%cond_48 = arith.andi %eq, %cond_47 : tensor<8x16xi1> loc(#loc190)
|
| 122 |
+
%cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc191)
|
| 123 |
+
%cond_50 = arith.extui %cond_49 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
|
| 124 |
+
%cond_51 = arith.xori %cond_50, %flip_23 : tensor<8x16xi32> loc(#loc192)
|
| 125 |
+
%cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<8x16xi32> loc(#loc193)
|
| 126 |
+
%ret = arith.xori %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc194)
|
| 127 |
+
%ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 128 |
+
%ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<8x16xi32> loc(#loc196)
|
| 129 |
+
%new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<8x16xi32> loc(#loc197)
|
| 130 |
+
%new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 131 |
+
%new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199)
|
| 132 |
+
%new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc199)
|
| 133 |
+
%new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<8x16xi32> loc(#loc199)
|
| 134 |
+
%flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc155)
|
| 135 |
+
%flip_60 = tt.reshape %flip_59 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc156)
|
| 136 |
+
%y_61 = tt.reshape %ret_54 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162)
|
| 137 |
+
%ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc164)
|
| 138 |
+
%ileft_63 = arith.muli %y_61, %ileft_62 : tensor<32x2x2xi32> loc(#loc164)
|
| 139 |
+
%ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({
|
| 140 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 141 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 142 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 143 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201)
|
| 144 |
+
%ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166)
|
| 145 |
+
%ileft_66 = tt.broadcast %ileft_65 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167)
|
| 146 |
+
%iright_67 = arith.muli %y_61, %flip_22 : tensor<32x2x2xi32> loc(#loc168)
|
| 147 |
+
%iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({
|
| 148 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 149 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 150 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 151 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203)
|
| 152 |
+
%iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170)
|
| 153 |
+
%iright_70 = tt.broadcast %iright_69 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171)
|
| 154 |
+
%ileft_71 = tt.reshape %ileft_66 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 155 |
+
%iright_72 = tt.reshape %iright_70 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 156 |
+
%y_idx_73 = tt.reshape %new_idxs_58 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174)
|
| 157 |
+
%left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<32x2x2xi32> loc(#loc176)
|
| 158 |
+
%left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({
|
| 159 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 160 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 161 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 162 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206)
|
| 163 |
+
%left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178)
|
| 164 |
+
%left_idx_77 = tt.broadcast %left_idx_76 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179)
|
| 165 |
+
%right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<32x2x2xi32> loc(#loc181)
|
| 166 |
+
%right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({
|
| 167 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 168 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 169 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 170 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209)
|
| 171 |
+
%right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183)
|
| 172 |
+
%right_idx_81 = tt.broadcast %right_idx_80 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184)
|
| 173 |
+
%left_idx_82 = tt.reshape %left_idx_77 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 174 |
+
%right_idx_83 = tt.reshape %right_idx_81 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 175 |
+
%cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc187)
|
| 176 |
+
%eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc188)
|
| 177 |
+
%cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<8x16xi32> loc(#loc189)
|
| 178 |
+
%cond_87 = arith.andi %eq_85, %cond_86 : tensor<8x16xi1> loc(#loc190)
|
| 179 |
+
%cond_88 = arith.ori %cond_84, %cond_87 : tensor<8x16xi1> loc(#loc191)
|
| 180 |
+
%cond_89 = arith.extui %cond_88 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
|
| 181 |
+
%cond_90 = arith.xori %cond_89, %flip_60 : tensor<8x16xi32> loc(#loc192)
|
| 182 |
+
%cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<8x16xi32> loc(#loc193)
|
| 183 |
+
%ret_92 = arith.xori %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc194)
|
| 184 |
+
%ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 185 |
+
%ret_94 = arith.xori %ret_54, %ret_93 : tensor<8x16xi32> loc(#loc196)
|
| 186 |
+
%new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<8x16xi32> loc(#loc197)
|
| 187 |
+
%new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 188 |
+
%new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<8x16xi32> loc(#loc199)
|
| 189 |
+
%y_98 = tt.reshape %ret_94 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
|
| 190 |
+
%ileft_99 = arith.muli %y_98, %ileft : tensor<64x2x1xi32> loc(#loc164)
|
| 191 |
+
%ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({
|
| 192 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 193 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 194 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 195 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
|
| 196 |
+
%ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
|
| 197 |
+
%ileft_102 = tt.broadcast %ileft_101 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
|
| 198 |
+
%iright_103 = arith.muli %y_98, %iright : tensor<64x2x1xi32> loc(#loc168)
|
| 199 |
+
%iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({
|
| 200 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 201 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 202 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 203 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
|
| 204 |
+
%iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
|
| 205 |
+
%iright_106 = tt.broadcast %iright_105 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
|
| 206 |
+
%ileft_107 = tt.reshape %ileft_102 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 207 |
+
%iright_108 = tt.reshape %iright_106 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 208 |
+
%y_idx_109 = tt.reshape %new_idxs_97 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174)
|
| 209 |
+
%left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<64x2x1xi32> loc(#loc176)
|
| 210 |
+
%left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({
|
| 211 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 212 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 213 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 214 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
|
| 215 |
+
%left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
|
| 216 |
+
%left_idx_113 = tt.broadcast %left_idx_112 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
|
| 217 |
+
%right_idx_114 = arith.muli %y_idx_109, %iright : tensor<64x2x1xi32> loc(#loc181)
|
| 218 |
+
%right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({
|
| 219 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 220 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 221 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 222 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
|
| 223 |
+
%right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
|
| 224 |
+
%right_idx_117 = tt.broadcast %right_idx_116 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
|
| 225 |
+
%left_idx_118 = tt.reshape %left_idx_113 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 226 |
+
%right_idx_119 = tt.reshape %right_idx_117 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 227 |
+
%cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc187)
|
| 228 |
+
%eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc188)
|
| 229 |
+
%cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<8x16xi32> loc(#loc189)
|
| 230 |
+
%cond_123 = arith.andi %eq_121, %cond_122 : tensor<8x16xi1> loc(#loc190)
|
| 231 |
+
%cond_124 = arith.ori %cond_120, %cond_123 : tensor<8x16xi1> loc(#loc191)
|
| 232 |
+
%cond_125 = arith.extui %cond_124 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
|
| 233 |
+
%cond_126 = arith.xori %cond_125, %flip_60 : tensor<8x16xi32> loc(#loc192)
|
| 234 |
+
%cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<8x16xi32> loc(#loc193)
|
| 235 |
+
%ret_128 = arith.xori %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc194)
|
| 236 |
+
%ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 237 |
+
%ret_130 = arith.xori %ret_94, %ret_129 : tensor<8x16xi32> loc(#loc196)
|
| 238 |
+
%new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<8x16xi32> loc(#loc197)
|
| 239 |
+
%new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 240 |
+
%new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<8x16xi32> loc(#loc199)
|
| 241 |
+
%flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc155)
|
| 242 |
+
%flip_135 = tt.reshape %flip_134 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc156)
|
| 243 |
+
%y_136 = tt.reshape %ret_130 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc162)
|
| 244 |
+
%ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc164)
|
| 245 |
+
%ileft_138 = arith.muli %y_136, %ileft_137 : tensor<16x2x4xi32> loc(#loc164)
|
| 246 |
+
%ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({
|
| 247 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 248 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 249 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 250 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc201)
|
| 251 |
+
%ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc166)
|
| 252 |
+
%ileft_141 = tt.broadcast %ileft_140 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc167)
|
| 253 |
+
%iright_142 = arith.muli %y_136, %flip_59 : tensor<16x2x4xi32> loc(#loc168)
|
| 254 |
+
%iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({
|
| 255 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 256 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 257 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 258 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc203)
|
| 259 |
+
%iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc170)
|
| 260 |
+
%iright_145 = tt.broadcast %iright_144 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc171)
|
| 261 |
+
%ileft_146 = tt.reshape %ileft_141 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 262 |
+
%iright_147 = tt.reshape %iright_145 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 263 |
+
%y_idx_148 = tt.reshape %new_idxs_133 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc174)
|
| 264 |
+
%left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<16x2x4xi32> loc(#loc176)
|
| 265 |
+
%left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({
|
| 266 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 267 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 268 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 269 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc206)
|
| 270 |
+
%left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc178)
|
| 271 |
+
%left_idx_152 = tt.broadcast %left_idx_151 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc179)
|
| 272 |
+
%right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<16x2x4xi32> loc(#loc181)
|
| 273 |
+
%right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({
|
| 274 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 275 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 276 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 277 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc209)
|
| 278 |
+
%right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc183)
|
| 279 |
+
%right_idx_156 = tt.broadcast %right_idx_155 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc184)
|
| 280 |
+
%left_idx_157 = tt.reshape %left_idx_152 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 281 |
+
%right_idx_158 = tt.reshape %right_idx_156 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 282 |
+
%cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc187)
|
| 283 |
+
%eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc188)
|
| 284 |
+
%cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<8x16xi32> loc(#loc189)
|
| 285 |
+
%cond_162 = arith.andi %eq_160, %cond_161 : tensor<8x16xi1> loc(#loc190)
|
| 286 |
+
%cond_163 = arith.ori %cond_159, %cond_162 : tensor<8x16xi1> loc(#loc191)
|
| 287 |
+
%cond_164 = arith.extui %cond_163 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
|
| 288 |
+
%cond_165 = arith.xori %cond_164, %flip_135 : tensor<8x16xi32> loc(#loc192)
|
| 289 |
+
%cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<8x16xi32> loc(#loc193)
|
| 290 |
+
%ret_167 = arith.xori %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc194)
|
| 291 |
+
%ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 292 |
+
%ret_169 = arith.xori %ret_130, %ret_168 : tensor<8x16xi32> loc(#loc196)
|
| 293 |
+
%new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<8x16xi32> loc(#loc197)
|
| 294 |
+
%new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 295 |
+
%new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<8x16xi32> loc(#loc199)
|
| 296 |
+
%y_173 = tt.reshape %ret_169 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162)
|
| 297 |
+
%ileft_174 = arith.muli %y_173, %ileft_62 : tensor<32x2x2xi32> loc(#loc164)
|
| 298 |
+
%ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({
|
| 299 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 300 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 301 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 302 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201)
|
| 303 |
+
%ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166)
|
| 304 |
+
%ileft_177 = tt.broadcast %ileft_176 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167)
|
| 305 |
+
%iright_178 = arith.muli %y_173, %flip_22 : tensor<32x2x2xi32> loc(#loc168)
|
| 306 |
+
%iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({
|
| 307 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 308 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 309 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 310 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203)
|
| 311 |
+
%iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170)
|
| 312 |
+
%iright_181 = tt.broadcast %iright_180 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171)
|
| 313 |
+
%ileft_182 = tt.reshape %ileft_177 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 314 |
+
%iright_183 = tt.reshape %iright_181 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 315 |
+
%y_idx_184 = tt.reshape %new_idxs_172 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174)
|
| 316 |
+
%left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<32x2x2xi32> loc(#loc176)
|
| 317 |
+
%left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({
|
| 318 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 319 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 320 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 321 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206)
|
| 322 |
+
%left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178)
|
| 323 |
+
%left_idx_188 = tt.broadcast %left_idx_187 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179)
|
| 324 |
+
%right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<32x2x2xi32> loc(#loc181)
|
| 325 |
+
%right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({
|
| 326 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 327 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 328 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 329 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209)
|
| 330 |
+
%right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183)
|
| 331 |
+
%right_idx_192 = tt.broadcast %right_idx_191 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184)
|
| 332 |
+
%left_idx_193 = tt.reshape %left_idx_188 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 333 |
+
%right_idx_194 = tt.reshape %right_idx_192 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 334 |
+
%cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc187)
|
| 335 |
+
%eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc188)
|
| 336 |
+
%cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<8x16xi32> loc(#loc189)
|
| 337 |
+
%cond_198 = arith.andi %eq_196, %cond_197 : tensor<8x16xi1> loc(#loc190)
|
| 338 |
+
%cond_199 = arith.ori %cond_195, %cond_198 : tensor<8x16xi1> loc(#loc191)
|
| 339 |
+
%cond_200 = arith.extui %cond_199 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
|
| 340 |
+
%cond_201 = arith.xori %cond_200, %flip_135 : tensor<8x16xi32> loc(#loc192)
|
| 341 |
+
%cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<8x16xi32> loc(#loc193)
|
| 342 |
+
%ret_203 = arith.xori %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc194)
|
| 343 |
+
%ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 344 |
+
%ret_205 = arith.xori %ret_169, %ret_204 : tensor<8x16xi32> loc(#loc196)
|
| 345 |
+
%new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<8x16xi32> loc(#loc197)
|
| 346 |
+
%new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 347 |
+
%new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<8x16xi32> loc(#loc199)
|
| 348 |
+
%y_209 = tt.reshape %ret_205 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
|
| 349 |
+
%ileft_210 = arith.muli %y_209, %ileft : tensor<64x2x1xi32> loc(#loc164)
|
| 350 |
+
%ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({
|
| 351 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 352 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 353 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 354 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
|
| 355 |
+
%ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
|
| 356 |
+
%ileft_213 = tt.broadcast %ileft_212 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
|
| 357 |
+
%iright_214 = arith.muli %y_209, %iright : tensor<64x2x1xi32> loc(#loc168)
|
| 358 |
+
%iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({
|
| 359 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 360 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 361 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 362 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
|
| 363 |
+
%iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
|
| 364 |
+
%iright_217 = tt.broadcast %iright_216 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
|
| 365 |
+
%ileft_218 = tt.reshape %ileft_213 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 366 |
+
%iright_219 = tt.reshape %iright_217 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 367 |
+
%y_idx_220 = tt.reshape %new_idxs_208 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174)
|
| 368 |
+
%left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<64x2x1xi32> loc(#loc176)
|
| 369 |
+
%left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({
|
| 370 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 371 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 372 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 373 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
|
| 374 |
+
%left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
|
| 375 |
+
%left_idx_224 = tt.broadcast %left_idx_223 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
|
| 376 |
+
%right_idx_225 = arith.muli %y_idx_220, %iright : tensor<64x2x1xi32> loc(#loc181)
|
| 377 |
+
%right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({
|
| 378 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 379 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 380 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 381 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
|
| 382 |
+
%right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
|
| 383 |
+
%right_idx_228 = tt.broadcast %right_idx_227 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
|
| 384 |
+
%left_idx_229 = tt.reshape %left_idx_224 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 385 |
+
%right_idx_230 = tt.reshape %right_idx_228 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 386 |
+
%cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc187)
|
| 387 |
+
%eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc188)
|
| 388 |
+
%cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<8x16xi32> loc(#loc189)
|
| 389 |
+
%cond_234 = arith.andi %eq_232, %cond_233 : tensor<8x16xi1> loc(#loc190)
|
| 390 |
+
%cond_235 = arith.ori %cond_231, %cond_234 : tensor<8x16xi1> loc(#loc191)
|
| 391 |
+
%cond_236 = arith.extui %cond_235 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192)
|
| 392 |
+
%cond_237 = arith.xori %cond_236, %flip_135 : tensor<8x16xi32> loc(#loc192)
|
| 393 |
+
%cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<8x16xi32> loc(#loc193)
|
| 394 |
+
%ret_239 = arith.xori %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc194)
|
| 395 |
+
%ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 396 |
+
%ret_241 = arith.xori %ret_205, %ret_240 : tensor<8x16xi32> loc(#loc196)
|
| 397 |
+
%new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<8x16xi32> loc(#loc197)
|
| 398 |
+
%new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 399 |
+
%new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<8x16xi32> loc(#loc199)
|
| 400 |
+
%y_245 = tt.reshape %ret_241 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc162)
|
| 401 |
+
%ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc164)
|
| 402 |
+
%ileft_247 = arith.muli %y_245, %ileft_246 : tensor<8x2x8xi32> loc(#loc164)
|
| 403 |
+
%ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({
|
| 404 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 405 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 406 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 407 |
+
}) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc201)
|
| 408 |
+
%ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc166)
|
| 409 |
+
%ileft_250 = tt.broadcast %ileft_249 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc167)
|
| 410 |
+
%iright_251 = arith.muli %y_245, %flip_134 : tensor<8x2x8xi32> loc(#loc168)
|
| 411 |
+
%iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({
|
| 412 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 413 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 414 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 415 |
+
}) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc203)
|
| 416 |
+
%iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc170)
|
| 417 |
+
%iright_254 = tt.broadcast %iright_253 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc171)
|
| 418 |
+
%ileft_255 = tt.reshape %ileft_250 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 419 |
+
%iright_256 = tt.reshape %iright_254 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 420 |
+
%y_idx_257 = tt.reshape %new_idxs_244 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc174)
|
| 421 |
+
%left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<8x2x8xi32> loc(#loc176)
|
| 422 |
+
%left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({
|
| 423 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 424 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 425 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 426 |
+
}) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc206)
|
| 427 |
+
%left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc178)
|
| 428 |
+
%left_idx_261 = tt.broadcast %left_idx_260 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc179)
|
| 429 |
+
%right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<8x2x8xi32> loc(#loc181)
|
| 430 |
+
%right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({
|
| 431 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 432 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 433 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 434 |
+
}) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc209)
|
| 435 |
+
%right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc183)
|
| 436 |
+
%right_idx_265 = tt.broadcast %right_idx_264 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc184)
|
| 437 |
+
%left_idx_266 = tt.reshape %left_idx_261 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 438 |
+
%right_idx_267 = tt.reshape %right_idx_265 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 439 |
+
%cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc187)
|
| 440 |
+
%eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc188)
|
| 441 |
+
%cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<8x16xi32> loc(#loc189)
|
| 442 |
+
%cond_271 = arith.andi %eq_269, %cond_270 : tensor<8x16xi1> loc(#loc190)
|
| 443 |
+
%cond_272 = arith.ori %cond_268, %cond_271 : tensor<8x16xi1> loc(#loc191)
|
| 444 |
+
%ret_273 = arith.xori %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc194)
|
| 445 |
+
%ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 446 |
+
%ret_275 = arith.xori %ret_241, %ret_274 : tensor<8x16xi32> loc(#loc196)
|
| 447 |
+
%new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<8x16xi32> loc(#loc197)
|
| 448 |
+
%new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 449 |
+
%new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<8x16xi32> loc(#loc199)
|
| 450 |
+
%y_279 = tt.reshape %ret_275 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc162)
|
| 451 |
+
%ileft_280 = arith.muli %y_279, %ileft_137 : tensor<16x2x4xi32> loc(#loc164)
|
| 452 |
+
%ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({
|
| 453 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 454 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 455 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 456 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc201)
|
| 457 |
+
%ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc166)
|
| 458 |
+
%ileft_283 = tt.broadcast %ileft_282 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc167)
|
| 459 |
+
%iright_284 = arith.muli %y_279, %flip_59 : tensor<16x2x4xi32> loc(#loc168)
|
| 460 |
+
%iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({
|
| 461 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 462 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 463 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 464 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc203)
|
| 465 |
+
%iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc170)
|
| 466 |
+
%iright_287 = tt.broadcast %iright_286 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc171)
|
| 467 |
+
%ileft_288 = tt.reshape %ileft_283 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 468 |
+
%iright_289 = tt.reshape %iright_287 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 469 |
+
%y_idx_290 = tt.reshape %new_idxs_278 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc174)
|
| 470 |
+
%left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<16x2x4xi32> loc(#loc176)
|
| 471 |
+
%left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({
|
| 472 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 473 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 474 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 475 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc206)
|
| 476 |
+
%left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc178)
|
| 477 |
+
%left_idx_294 = tt.broadcast %left_idx_293 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc179)
|
| 478 |
+
%right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<16x2x4xi32> loc(#loc181)
|
| 479 |
+
%right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({
|
| 480 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 481 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 482 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 483 |
+
}) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc209)
|
| 484 |
+
%right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc183)
|
| 485 |
+
%right_idx_298 = tt.broadcast %right_idx_297 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc184)
|
| 486 |
+
%left_idx_299 = tt.reshape %left_idx_294 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 487 |
+
%right_idx_300 = tt.reshape %right_idx_298 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 488 |
+
%cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc187)
|
| 489 |
+
%eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc188)
|
| 490 |
+
%cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<8x16xi32> loc(#loc189)
|
| 491 |
+
%cond_304 = arith.andi %eq_302, %cond_303 : tensor<8x16xi1> loc(#loc190)
|
| 492 |
+
%cond_305 = arith.ori %cond_301, %cond_304 : tensor<8x16xi1> loc(#loc191)
|
| 493 |
+
%ret_306 = arith.xori %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc194)
|
| 494 |
+
%ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 495 |
+
%ret_308 = arith.xori %ret_275, %ret_307 : tensor<8x16xi32> loc(#loc196)
|
| 496 |
+
%new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<8x16xi32> loc(#loc197)
|
| 497 |
+
%new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 498 |
+
%new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<8x16xi32> loc(#loc199)
|
| 499 |
+
%y_312 = tt.reshape %ret_308 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162)
|
| 500 |
+
%ileft_313 = arith.muli %y_312, %ileft_62 : tensor<32x2x2xi32> loc(#loc164)
|
| 501 |
+
%ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({
|
| 502 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 503 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 504 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 505 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201)
|
| 506 |
+
%ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166)
|
| 507 |
+
%ileft_316 = tt.broadcast %ileft_315 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167)
|
| 508 |
+
%iright_317 = arith.muli %y_312, %flip_22 : tensor<32x2x2xi32> loc(#loc168)
|
| 509 |
+
%iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({
|
| 510 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 511 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 512 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 513 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203)
|
| 514 |
+
%iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170)
|
| 515 |
+
%iright_320 = tt.broadcast %iright_319 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171)
|
| 516 |
+
%ileft_321 = tt.reshape %ileft_316 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 517 |
+
%iright_322 = tt.reshape %iright_320 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 518 |
+
%y_idx_323 = tt.reshape %new_idxs_311 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174)
|
| 519 |
+
%left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<32x2x2xi32> loc(#loc176)
|
| 520 |
+
%left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({
|
| 521 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 522 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 523 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 524 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206)
|
| 525 |
+
%left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178)
|
| 526 |
+
%left_idx_327 = tt.broadcast %left_idx_326 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179)
|
| 527 |
+
%right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<32x2x2xi32> loc(#loc181)
|
| 528 |
+
%right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({
|
| 529 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 530 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 531 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 532 |
+
}) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209)
|
| 533 |
+
%right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183)
|
| 534 |
+
%right_idx_331 = tt.broadcast %right_idx_330 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184)
|
| 535 |
+
%left_idx_332 = tt.reshape %left_idx_327 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 536 |
+
%right_idx_333 = tt.reshape %right_idx_331 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 537 |
+
%cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc187)
|
| 538 |
+
%eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc188)
|
| 539 |
+
%cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<8x16xi32> loc(#loc189)
|
| 540 |
+
%cond_337 = arith.andi %eq_335, %cond_336 : tensor<8x16xi1> loc(#loc190)
|
| 541 |
+
%cond_338 = arith.ori %cond_334, %cond_337 : tensor<8x16xi1> loc(#loc191)
|
| 542 |
+
%ret_339 = arith.xori %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc194)
|
| 543 |
+
%ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195)
|
| 544 |
+
%ret_341 = arith.xori %ret_308, %ret_340 : tensor<8x16xi32> loc(#loc196)
|
| 545 |
+
%new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<8x16xi32> loc(#loc197)
|
| 546 |
+
%new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 547 |
+
%new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<8x16xi32> loc(#loc199)
|
| 548 |
+
%y_345 = tt.reshape %ret_341 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162)
|
| 549 |
+
%ileft_346 = arith.muli %y_345, %ileft : tensor<64x2x1xi32> loc(#loc164)
|
| 550 |
+
%ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({
|
| 551 |
+
^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
|
| 552 |
+
%ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
|
| 553 |
+
tt.reduce.return %ileft_379 : i32 loc(#loc201)
|
| 554 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201)
|
| 555 |
+
%ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166)
|
| 556 |
+
%ileft_349 = tt.broadcast %ileft_348 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167)
|
| 557 |
+
%iright_350 = arith.muli %y_345, %iright : tensor<64x2x1xi32> loc(#loc168)
|
| 558 |
+
%iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({
|
| 559 |
+
^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
|
| 560 |
+
%iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
|
| 561 |
+
tt.reduce.return %iright_379 : i32 loc(#loc203)
|
| 562 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203)
|
| 563 |
+
%iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170)
|
| 564 |
+
%iright_353 = tt.broadcast %iright_352 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171)
|
| 565 |
+
%ileft_354 = tt.reshape %ileft_349 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172)
|
| 566 |
+
%iright_355 = tt.reshape %iright_353 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173)
|
| 567 |
+
%y_idx_356 = tt.reshape %new_idxs_344 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174)
|
| 568 |
+
%left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<64x2x1xi32> loc(#loc176)
|
| 569 |
+
%left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({
|
| 570 |
+
^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
|
| 571 |
+
%left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
|
| 572 |
+
tt.reduce.return %left_idx_379 : i32 loc(#loc206)
|
| 573 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206)
|
| 574 |
+
%left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178)
|
| 575 |
+
%left_idx_360 = tt.broadcast %left_idx_359 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179)
|
| 576 |
+
%right_idx_361 = arith.muli %y_idx_356, %iright : tensor<64x2x1xi32> loc(#loc181)
|
| 577 |
+
%right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({
|
| 578 |
+
^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
|
| 579 |
+
%right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
|
| 580 |
+
tt.reduce.return %right_idx_379 : i32 loc(#loc209)
|
| 581 |
+
}) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209)
|
| 582 |
+
%right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183)
|
| 583 |
+
%right_idx_364 = tt.broadcast %right_idx_363 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184)
|
| 584 |
+
%left_idx_365 = tt.reshape %left_idx_360 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185)
|
| 585 |
+
%right_idx_366 = tt.reshape %right_idx_364 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186)
|
| 586 |
+
%cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<8x16xi32> loc(#loc187)
|
| 587 |
+
%eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<8x16xi32> loc(#loc188)
|
| 588 |
+
%cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<8x16xi32> loc(#loc189)
|
| 589 |
+
%cond_370 = arith.andi %eq_368, %cond_369 : tensor<8x16xi1> loc(#loc190)
|
| 590 |
+
%cond_371 = arith.ori %cond_367, %cond_370 : tensor<8x16xi1> loc(#loc191)
|
| 591 |
+
%new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<8x16xi32> loc(#loc197)
|
| 592 |
+
%new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198)
|
| 593 |
+
%new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<8x16xi32> loc(#loc199)
|
| 594 |
+
%tmp7 = arith.extsi %tmp0_19 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc149)
|
| 595 |
+
%tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc86)
|
| 596 |
+
%tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({
|
| 597 |
+
^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))):
|
| 598 |
+
%tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200)
|
| 599 |
+
tt.reduce.return %tmp11_379 : i64 loc(#loc160)
|
| 600 |
+
}) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc160)
|
| 601 |
+
%tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc151)
|
| 602 |
+
%tmp14 = arith.trunci %tmp11_376 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc152)
|
| 603 |
+
%0 = arith.muli %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc73)
|
| 604 |
+
%1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc74)
|
| 605 |
+
%2 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc74)
|
| 606 |
+
%3 = arith.addi %1, %2 : tensor<8x16xi32> loc(#loc74)
|
| 607 |
+
%4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<8x16x!tt.ptr<i32>> loc(#loc75)
|
| 608 |
+
%5 = tt.addptr %4, %3 : tensor<8x16x!tt.ptr<i32>>, tensor<8x16xi32> loc(#loc75)
|
| 609 |
+
tt.store %5, %new_idxs_374, %tmp0_18 : tensor<8x16x!tt.ptr<i32>> loc(#loc76)
|
| 610 |
+
%6 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<8x1x!tt.ptr<i32>> loc(#loc77)
|
| 611 |
+
%7 = tt.addptr %6, %xindex_6 : tensor<8x1x!tt.ptr<i32>>, tensor<8x1xi32> loc(#loc77)
|
| 612 |
+
tt.store %7, %tmp14, %xmask_7 : tensor<8x1x!tt.ptr<i32>> loc(#loc78)
|
| 613 |
+
tt.return loc(#loc79)
|
| 614 |
+
} loc(#loc)
|
| 615 |
+
} loc(#loc)
|
| 616 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34)
|
| 617 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49)
|
| 618 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38)
|
| 619 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21)
|
| 620 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28)
|
| 621 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33)
|
| 622 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:36)
|
| 623 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44)
|
| 624 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23)
|
| 625 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:28)
|
| 626 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38)
|
| 627 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19)
|
| 628 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19)
|
| 629 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35)
|
| 630 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45)
|
| 631 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30)
|
| 632 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54)
|
| 633 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19)
|
| 634 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":40:33)
|
| 635 |
+
#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41)
|
| 636 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
|
| 637 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
|
| 638 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
|
| 639 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
|
| 640 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
|
| 641 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
|
| 642 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 643 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 644 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
|
| 645 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
|
| 646 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
|
| 647 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
|
| 648 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
|
| 649 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
|
| 650 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
|
| 651 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
|
| 652 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
|
| 653 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
|
| 654 |
+
#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
|
| 655 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
|
| 656 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
|
| 657 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
|
| 658 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
|
| 659 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
|
| 660 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
|
| 661 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
|
| 662 |
+
#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
|
| 663 |
+
#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
|
| 664 |
+
#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
|
| 665 |
+
#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
|
| 666 |
+
#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
|
| 667 |
+
#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
|
| 668 |
+
#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
|
| 669 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
|
| 670 |
+
#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
|
| 671 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
|
| 672 |
+
#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
|
| 673 |
+
#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
|
| 674 |
+
#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
|
| 675 |
+
#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
|
| 676 |
+
#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19)
|
| 677 |
+
#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29)
|
| 678 |
+
#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21)
|
| 679 |
+
#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35)
|
| 680 |
+
#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32)
|
| 681 |
+
#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25)
|
| 682 |
+
#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47)
|
| 683 |
+
#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25)
|
| 684 |
+
#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37)
|
| 685 |
+
#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4)
|
| 686 |
+
#loc85 = loc(callsite(#loc1 at #loc2))
|
| 687 |
+
#loc86 = loc("tmp10"(#loc3))
|
| 688 |
+
#loc87 = loc("tmp0"(#loc4))
|
| 689 |
+
#loc88 = loc("tmp0"(#loc5))
|
| 690 |
+
#loc89 = loc("xmask"(#loc6))
|
| 691 |
+
#loc90 = loc("xoffset"(#loc7))
|
| 692 |
+
#loc91 = loc("xoffset"(#loc8))
|
| 693 |
+
#loc92 = loc("xindex"(#loc9))
|
| 694 |
+
#loc93 = loc("xindex"(#loc10))
|
| 695 |
+
#loc94 = loc("xindex"(#loc11))
|
| 696 |
+
#loc95 = loc("r0_index"(#loc12))
|
| 697 |
+
#loc96 = loc("r0_index"(#loc13))
|
| 698 |
+
#loc97 = loc("x0"(#loc14))
|
| 699 |
+
#loc98 = loc("x1"(#loc15))
|
| 700 |
+
#loc99 = loc("tmp0"(#loc16))
|
| 701 |
+
#loc100 = loc("tmp0"(#loc17))
|
| 702 |
+
#loc101 = loc("tmp0"(#loc18))
|
| 703 |
+
#loc102 = loc("tmp0"(#loc19))
|
| 704 |
+
#loc103 = loc("tmp2"(#loc20))
|
| 705 |
+
#loc104 = loc("tmp4"(#loc21))
|
| 706 |
+
#loc105 = loc("flip"(#loc22))
|
| 707 |
+
#loc107 = loc("flip"(#loc24))
|
| 708 |
+
#loc108 = loc("flip"(#loc25))
|
| 709 |
+
#loc109 = loc("flip"(#loc26))
|
| 710 |
+
#loc110 = loc("y"(#loc27))
|
| 711 |
+
#loc111 = loc("left_mask"(#loc29))
|
| 712 |
+
#loc112 = loc("ileft"(#loc30))
|
| 713 |
+
#loc114 = loc("ileft"(#loc34))
|
| 714 |
+
#loc115 = loc("ileft"(#loc35))
|
| 715 |
+
#loc116 = loc("iright"(#loc36))
|
| 716 |
+
#loc118 = loc("iright"(#loc38))
|
| 717 |
+
#loc119 = loc("iright"(#loc39))
|
| 718 |
+
#loc120 = loc("ileft"(#loc40))
|
| 719 |
+
#loc121 = loc("iright"(#loc41))
|
| 720 |
+
#loc122 = loc("y_idx"(#loc42))
|
| 721 |
+
#loc123 = loc("left_idx"(#loc43))
|
| 722 |
+
#loc124 = loc("left_idx"(#loc44))
|
| 723 |
+
#loc125 = loc("input"(#loc45))
|
| 724 |
+
#loc127 = loc("left_idx"(#loc47))
|
| 725 |
+
#loc128 = loc("left_idx"(#loc48))
|
| 726 |
+
#loc129 = loc("right_idx"(#loc49))
|
| 727 |
+
#loc130 = loc("right_idx"(#loc50))
|
| 728 |
+
#loc132 = loc("right_idx"(#loc52))
|
| 729 |
+
#loc133 = loc("right_idx"(#loc53))
|
| 730 |
+
#loc134 = loc("left_idx"(#loc54))
|
| 731 |
+
#loc135 = loc("right_idx"(#loc55))
|
| 732 |
+
#loc136 = loc("cond"(#loc56))
|
| 733 |
+
#loc137 = loc("eq"(#loc57))
|
| 734 |
+
#loc138 = loc("cond"(#loc58))
|
| 735 |
+
#loc139 = loc("cond"(#loc59))
|
| 736 |
+
#loc140 = loc("cond"(#loc60))
|
| 737 |
+
#loc141 = loc("cond"(#loc61))
|
| 738 |
+
#loc142 = loc("cond"(#loc62))
|
| 739 |
+
#loc143 = loc("ret"(#loc63))
|
| 740 |
+
#loc144 = loc("ret"(#loc64))
|
| 741 |
+
#loc145 = loc("ret"(#loc65))
|
| 742 |
+
#loc146 = loc("new_idxs"(#loc66))
|
| 743 |
+
#loc147 = loc("new_idxs"(#loc67))
|
| 744 |
+
#loc148 = loc("new_idxs"(#loc68))
|
| 745 |
+
#loc149 = loc("tmp7"(#loc69))
|
| 746 |
+
#loc151 = loc("tmp11"(#loc71))
|
| 747 |
+
#loc152 = loc("tmp14"(#loc72))
|
| 748 |
+
#loc153 = loc(callsite(#loc105 at #loc106))
|
| 749 |
+
#loc154 = loc(callsite(#loc107 at #loc106))
|
| 750 |
+
#loc155 = loc(callsite(#loc108 at #loc106))
|
| 751 |
+
#loc156 = loc(callsite(#loc109 at #loc106))
|
| 752 |
+
#loc158 = loc("cond"(#loc136))
|
| 753 |
+
#loc159 = loc("eq"(#loc137))
|
| 754 |
+
#loc160 = loc(callsite(#loc31 at #loc150))
|
| 755 |
+
#loc162 = loc(callsite(#loc110 at #loc157))
|
| 756 |
+
#loc163 = loc(callsite(#loc111 at #loc157))
|
| 757 |
+
#loc164 = loc(callsite(#loc112 at #loc157))
|
| 758 |
+
#loc166 = loc(callsite(#loc114 at #loc157))
|
| 759 |
+
#loc167 = loc(callsite(#loc115 at #loc157))
|
| 760 |
+
#loc168 = loc(callsite(#loc116 at #loc157))
|
| 761 |
+
#loc170 = loc(callsite(#loc118 at #loc157))
|
| 762 |
+
#loc171 = loc(callsite(#loc119 at #loc157))
|
| 763 |
+
#loc172 = loc(callsite(#loc120 at #loc157))
|
| 764 |
+
#loc173 = loc(callsite(#loc121 at #loc157))
|
| 765 |
+
#loc174 = loc(callsite(#loc122 at #loc157))
|
| 766 |
+
#loc175 = loc(callsite(#loc123 at #loc157))
|
| 767 |
+
#loc176 = loc(callsite(#loc124 at #loc157))
|
| 768 |
+
#loc178 = loc(callsite(#loc127 at #loc157))
|
| 769 |
+
#loc179 = loc(callsite(#loc128 at #loc157))
|
| 770 |
+
#loc180 = loc(callsite(#loc129 at #loc157))
|
| 771 |
+
#loc181 = loc(callsite(#loc130 at #loc157))
|
| 772 |
+
#loc183 = loc(callsite(#loc132 at #loc157))
|
| 773 |
+
#loc184 = loc(callsite(#loc133 at #loc157))
|
| 774 |
+
#loc185 = loc(callsite(#loc134 at #loc157))
|
| 775 |
+
#loc186 = loc(callsite(#loc135 at #loc157))
|
| 776 |
+
#loc187 = loc(callsite(#loc158 at #loc157))
|
| 777 |
+
#loc188 = loc(callsite(#loc159 at #loc157))
|
| 778 |
+
#loc189 = loc(callsite(#loc138 at #loc157))
|
| 779 |
+
#loc190 = loc(callsite(#loc139 at #loc157))
|
| 780 |
+
#loc191 = loc(callsite(#loc140 at #loc157))
|
| 781 |
+
#loc192 = loc(callsite(#loc141 at #loc157))
|
| 782 |
+
#loc193 = loc(callsite(#loc142 at #loc157))
|
| 783 |
+
#loc194 = loc(callsite(#loc143 at #loc157))
|
| 784 |
+
#loc195 = loc(callsite(#loc144 at #loc157))
|
| 785 |
+
#loc196 = loc(callsite(#loc145 at #loc157))
|
| 786 |
+
#loc197 = loc(callsite(#loc146 at #loc157))
|
| 787 |
+
#loc198 = loc(callsite(#loc147 at #loc157))
|
| 788 |
+
#loc199 = loc(callsite(#loc148 at #loc157))
|
| 789 |
+
#loc200 = loc(callsite(#loc33 at #loc160))
|
| 790 |
+
#loc201 = loc(callsite(#loc31 at #loc165))
|
| 791 |
+
#loc203 = loc(callsite(#loc31 at #loc169))
|
| 792 |
+
#loc205 = loc(callsite(#loc125 at #loc177))
|
| 793 |
+
#loc206 = loc(callsite(#loc31 at #loc177))
|
| 794 |
+
#loc208 = loc(callsite(#loc125 at #loc182))
|
| 795 |
+
#loc209 = loc(callsite(#loc31 at #loc182))
|
| 796 |
+
#loc211 = loc(callsite(#loc33 at #loc201))
|
| 797 |
+
#loc212 = loc(callsite(#loc33 at #loc203))
|
| 798 |
+
#loc213 = loc(callsite(#loc33 at #loc206))
|
| 799 |
+
#loc214 = loc(callsite(#loc33 at #loc209))
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/__grp__triton_red_fused_argmax_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin
ADDED
|
Binary file (33.6 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "dce51101d74002d90b1fe021a32b8040ce453636ab0b0ab209427e2e72d803f0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"}
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir
ADDED
|
@@ -0,0 +1,611 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%8 = shl i32 %7, 3, !dbg !8
|
| 11 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%10 = and i32 %9, 384, !dbg !9
|
| 13 |
+
%11 = lshr exact i32 %10, 7, !dbg !9
|
| 14 |
+
%12 = or disjoint i32 %11, 4, !dbg !9
|
| 15 |
+
%13 = or disjoint i32 %11, %8, !dbg !10
|
| 16 |
+
%14 = or disjoint i32 %12, %8, !dbg !10
|
| 17 |
+
%15 = shl nuw nsw i32 %9, 2, !dbg !11
|
| 18 |
+
%16 = and i32 %15, 508, !dbg !11
|
| 19 |
+
%17 = sdiv i32 %13, 2048, !dbg !12
|
| 20 |
+
%18 = sdiv i32 %14, 2048, !dbg !12
|
| 21 |
+
%19 = mul i32 %13, 32000
|
| 22 |
+
%20 = mul i32 %17, 224000
|
| 23 |
+
%21 = add i32 %20, %19
|
| 24 |
+
%22 = mul i32 %14, 32000
|
| 25 |
+
%23 = mul i32 %18, 224000
|
| 26 |
+
%24 = add i32 %23, %22
|
| 27 |
+
%25 = zext nneg i32 %16 to i64, !dbg !13
|
| 28 |
+
br label %26, !dbg !13
|
| 29 |
+
|
| 30 |
+
26: ; preds = %6, %26
|
| 31 |
+
%indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %26 ]
|
| 32 |
+
%27 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %126, %26 ]
|
| 33 |
+
%28 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %131, %26 ]
|
| 34 |
+
%29 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %127, %26 ]
|
| 35 |
+
%30 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %132, %26 ]
|
| 36 |
+
%31 = phi <4 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %130, %26 ]
|
| 37 |
+
%32 = phi <4 x i32> [ splat (i32 2147483647), %6 ], [ %133, %26 ]
|
| 38 |
+
%33 = or disjoint i64 %indvars.iv, %25, !dbg !14
|
| 39 |
+
%34 = icmp samesign ult i64 %33, 32000, !dbg !15
|
| 40 |
+
%35 = trunc nuw nsw i64 %33 to i32, !dbg !16
|
| 41 |
+
%36 = add i32 %21, %35, !dbg !16
|
| 42 |
+
%37 = add i32 %24, %35, !dbg !16
|
| 43 |
+
%38 = sext i32 %36 to i64, !dbg !17
|
| 44 |
+
%39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !17
|
| 45 |
+
%40 = sext i32 %37 to i64, !dbg !17
|
| 46 |
+
%41 = getelementptr float, ptr addrspace(1) %0, i64 %40, !dbg !17
|
| 47 |
+
%42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18
|
| 48 |
+
%43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i64 %42, i1 %34) #4, !dbg !18
|
| 49 |
+
%44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18
|
| 50 |
+
%45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18
|
| 51 |
+
%46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18
|
| 52 |
+
%47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18
|
| 53 |
+
%48 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18
|
| 54 |
+
%49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %41, i64 %48, i1 %34) #4, !dbg !18
|
| 55 |
+
%50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !18
|
| 56 |
+
%51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !18
|
| 57 |
+
%52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !18
|
| 58 |
+
%53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !18
|
| 59 |
+
%54 = fcmp uno <2 x float> %27, zeroinitializer, !dbg !19
|
| 60 |
+
%55 = fcmp uno <4 x float> %31, zeroinitializer, !dbg !19
|
| 61 |
+
%56 = fcmp uno <2 x float> %29, zeroinitializer, !dbg !19
|
| 62 |
+
%57 = sext <2 x i32> %28 to <2 x i64>, !dbg !23
|
| 63 |
+
%58 = sext <2 x i32> %30 to <2 x i64>, !dbg !23
|
| 64 |
+
%59 = trunc nuw nsw i64 %33 to i32, !dbg !24
|
| 65 |
+
%60 = or disjoint i32 %59, 1, !dbg !24
|
| 66 |
+
%61 = insertelement <2 x i32> poison, i32 %44, i64 0, !dbg !18
|
| 67 |
+
%62 = insertelement <2 x i32> %61, i32 %45, i64 1, !dbg !18
|
| 68 |
+
%63 = bitcast <2 x i32> %62 to <2 x float>, !dbg !18
|
| 69 |
+
%64 = fcmp ogt <2 x float> %27, %63, !dbg !25
|
| 70 |
+
%65 = fcmp oeq <2 x float> %27, %63, !dbg !26
|
| 71 |
+
%66 = fcmp uno <2 x float> %63, zeroinitializer, !dbg !27
|
| 72 |
+
%67 = xor <2 x i1> %66, splat (i1 true), !dbg !28
|
| 73 |
+
%68 = and <2 x i1> %54, %67, !dbg !29
|
| 74 |
+
%69 = or <2 x i1> %64, %68, !dbg !30
|
| 75 |
+
%70 = and <2 x i1> %54, %66, !dbg !31
|
| 76 |
+
%71 = or <2 x i1> %65, %70, !dbg !32
|
| 77 |
+
%72 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !23
|
| 78 |
+
%73 = shufflevector <2 x i64> %72, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !23
|
| 79 |
+
%74 = icmp sgt <2 x i64> %73, %57, !dbg !23
|
| 80 |
+
%75 = icmp sge <2 x i64> %73, %57, !dbg !23
|
| 81 |
+
%76 = shufflevector <2 x i1> %74, <2 x i1> %75, <2 x i32> <i32 0, i32 3>, !dbg !23
|
| 82 |
+
%77 = and <2 x i1> %76, %71, !dbg !33
|
| 83 |
+
%78 = or <2 x i1> %69, %77, !dbg !34
|
| 84 |
+
%79 = select <2 x i1> %78, <2 x float> %27, <2 x float> %63, !dbg !35
|
| 85 |
+
%80 = insertelement <2 x i32> poison, i32 %35, i64 0, !dbg !24
|
| 86 |
+
%81 = insertelement <2 x i32> %80, i32 %60, i64 1, !dbg !24
|
| 87 |
+
%82 = select <2 x i1> %78, <2 x i32> %28, <2 x i32> %81, !dbg !24
|
| 88 |
+
%83 = insertelement <2 x i32> poison, i32 %50, i64 0, !dbg !18
|
| 89 |
+
%84 = insertelement <2 x i32> %83, i32 %51, i64 1, !dbg !18
|
| 90 |
+
%85 = bitcast <2 x i32> %84 to <2 x float>, !dbg !18
|
| 91 |
+
%86 = fcmp ogt <2 x float> %29, %85, !dbg !25
|
| 92 |
+
%87 = fcmp oeq <2 x float> %29, %85, !dbg !26
|
| 93 |
+
%88 = fcmp uno <2 x float> %85, zeroinitializer, !dbg !27
|
| 94 |
+
%89 = xor <2 x i1> %88, splat (i1 true), !dbg !28
|
| 95 |
+
%90 = and <2 x i1> %56, %89, !dbg !29
|
| 96 |
+
%91 = or <2 x i1> %86, %90, !dbg !30
|
| 97 |
+
%92 = and <2 x i1> %56, %88, !dbg !31
|
| 98 |
+
%93 = or <2 x i1> %87, %92, !dbg !32
|
| 99 |
+
%94 = icmp sgt <2 x i64> %73, %58, !dbg !23
|
| 100 |
+
%95 = icmp sge <2 x i64> %73, %58, !dbg !23
|
| 101 |
+
%96 = shufflevector <2 x i1> %94, <2 x i1> %95, <2 x i32> <i32 0, i32 3>, !dbg !23
|
| 102 |
+
%97 = and <2 x i1> %96, %93, !dbg !33
|
| 103 |
+
%98 = or <2 x i1> %91, %97, !dbg !34
|
| 104 |
+
%99 = select <2 x i1> %98, <2 x float> %29, <2 x float> %85, !dbg !35
|
| 105 |
+
%100 = select <2 x i1> %98, <2 x i32> %30, <2 x i32> %81, !dbg !24
|
| 106 |
+
%101 = or disjoint <2 x i64> %73, <i64 3, i64 2>, !dbg !14
|
| 107 |
+
%102 = shufflevector <2 x i64> %101, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>, !dbg !14
|
| 108 |
+
%103 = insertelement <4 x i32> poison, i32 %53, i64 0, !dbg !18
|
| 109 |
+
%104 = insertelement <4 x i32> %103, i32 %52, i64 1, !dbg !18
|
| 110 |
+
%105 = insertelement <4 x i32> %104, i32 %47, i64 2, !dbg !18
|
| 111 |
+
%106 = insertelement <4 x i32> %105, i32 %46, i64 3, !dbg !18
|
| 112 |
+
%107 = bitcast <4 x i32> %106 to <4 x float>, !dbg !18
|
| 113 |
+
%108 = fcmp ogt <4 x float> %31, %107, !dbg !25
|
| 114 |
+
%109 = fcmp oeq <4 x float> %31, %107, !dbg !26
|
| 115 |
+
%110 = fcmp uno <4 x float> %107, zeroinitializer, !dbg !27
|
| 116 |
+
%111 = xor <4 x i1> %110, splat (i1 true), !dbg !28
|
| 117 |
+
%112 = and <4 x i1> %55, %111, !dbg !29
|
| 118 |
+
%113 = or <4 x i1> %108, %112, !dbg !30
|
| 119 |
+
%114 = and <4 x i1> %55, %110, !dbg !31
|
| 120 |
+
%115 = or <4 x i1> %109, %114, !dbg !32
|
| 121 |
+
%116 = sext <4 x i32> %32 to <4 x i64>, !dbg !23
|
| 122 |
+
%117 = icmp sgt <4 x i64> %102, %116, !dbg !23
|
| 123 |
+
%118 = and <4 x i1> %117, %115, !dbg !33
|
| 124 |
+
%119 = or <4 x i1> %113, %118, !dbg !34
|
| 125 |
+
%120 = select <4 x i1> %119, <4 x float> %31, <4 x float> %107, !dbg !35
|
| 126 |
+
%121 = trunc <2 x i64> %101 to <2 x i32>, !dbg !24
|
| 127 |
+
%122 = shufflevector <2 x i32> %121, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>, !dbg !24
|
| 128 |
+
%123 = select <4 x i1> %119, <4 x i32> %32, <4 x i32> %122, !dbg !24
|
| 129 |
+
%124 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !36
|
| 130 |
+
%125 = shufflevector <2 x i1> %124, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !36
|
| 131 |
+
%126 = select <2 x i1> %125, <2 x float> %79, <2 x float> %27, !dbg !36
|
| 132 |
+
%127 = select <2 x i1> %125, <2 x float> %99, <2 x float> %29, !dbg !36
|
| 133 |
+
%128 = insertelement <4 x i1> poison, i1 %34, i64 0, !dbg !36
|
| 134 |
+
%129 = shufflevector <4 x i1> %128, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !36
|
| 135 |
+
%130 = select <4 x i1> %129, <4 x float> %120, <4 x float> %31, !dbg !36
|
| 136 |
+
%131 = select <2 x i1> %125, <2 x i32> %82, <2 x i32> %28, !dbg !37
|
| 137 |
+
%132 = select <2 x i1> %125, <2 x i32> %100, <2 x i32> %30, !dbg !37
|
| 138 |
+
%133 = select <4 x i1> %129, <4 x i32> %123, <4 x i32> %32, !dbg !37
|
| 139 |
+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 512, !dbg !13
|
| 140 |
+
%134 = icmp samesign ult i64 %indvars.iv, 31488, !dbg !13
|
| 141 |
+
br i1 %134, label %26, label %135, !dbg !13
|
| 142 |
+
|
| 143 |
+
135: ; preds = %26
|
| 144 |
+
%136 = and i32 %9, 7, !dbg !9
|
| 145 |
+
%137 = or disjoint i32 %8, %136, !dbg !10
|
| 146 |
+
%138 = and i32 %9, 31, !dbg !9
|
| 147 |
+
%139 = lshr i32 %9, 5, !dbg !9
|
| 148 |
+
%140 = shufflevector <2 x float> %126, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !38
|
| 149 |
+
%141 = fcmp ogt <2 x float> %126, %140, !dbg !38
|
| 150 |
+
%142 = fcmp oeq <2 x float> %126, %140, !dbg !38
|
| 151 |
+
%143 = shufflevector <2 x i1> %141, <2 x i1> %142, <2 x i32> <i32 0, i32 3>, !dbg !38
|
| 152 |
+
%144 = extractelement <2 x float> %126, i64 0, !dbg !40
|
| 153 |
+
%145 = fcmp uno float %144, 0.000000e+00, !dbg !40
|
| 154 |
+
%146 = extractelement <2 x float> %126, i64 1, !dbg !41
|
| 155 |
+
%147 = fcmp uno float %146, 0.000000e+00, !dbg !41
|
| 156 |
+
%148 = xor i1 %147, true, !dbg !42
|
| 157 |
+
%149 = insertelement <2 x i1> poison, i1 %145, i64 0, !dbg !43
|
| 158 |
+
%150 = shufflevector <2 x i1> %149, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43
|
| 159 |
+
%151 = insertelement <2 x i1> poison, i1 %148, i64 0, !dbg !43
|
| 160 |
+
%152 = insertelement <2 x i1> %151, i1 %147, i64 1, !dbg !43
|
| 161 |
+
%153 = and <2 x i1> %150, %152, !dbg !43
|
| 162 |
+
%154 = or <2 x i1> %143, %153, !dbg !44
|
| 163 |
+
%155 = extractelement <2 x i32> %131, i64 0, !dbg !45
|
| 164 |
+
%156 = extractelement <2 x i32> %131, i64 1, !dbg !45
|
| 165 |
+
%157 = icmp slt i32 %155, %156, !dbg !45
|
| 166 |
+
%158 = extractelement <2 x i1> %154, i64 1, !dbg !46
|
| 167 |
+
%159 = and i1 %157, %158, !dbg !46
|
| 168 |
+
%160 = extractelement <2 x i1> %154, i64 0, !dbg !47
|
| 169 |
+
%161 = or i1 %160, %159, !dbg !47
|
| 170 |
+
%162 = select i1 %161, float %144, float %146, !dbg !48
|
| 171 |
+
%163 = select i1 %161, i32 %155, i32 %156, !dbg !49
|
| 172 |
+
%164 = extractelement <4 x float> %130, i64 3, !dbg !38
|
| 173 |
+
%165 = fcmp ogt float %162, %164, !dbg !38
|
| 174 |
+
%166 = fcmp oeq float %162, %164, !dbg !50
|
| 175 |
+
%167 = fcmp uno float %162, 0.000000e+00, !dbg !40
|
| 176 |
+
%168 = fcmp uno <4 x float> %130, zeroinitializer, !dbg !41
|
| 177 |
+
%169 = extractelement <4 x i1> %168, i64 3, !dbg !51
|
| 178 |
+
%170 = xor i1 %169, true, !dbg !42
|
| 179 |
+
%171 = and i1 %167, %170, !dbg !43
|
| 180 |
+
%172 = or i1 %165, %171, !dbg !44
|
| 181 |
+
%173 = and i1 %169, %167, !dbg !51
|
| 182 |
+
%174 = or i1 %166, %173, !dbg !52
|
| 183 |
+
%175 = extractelement <4 x i32> %133, i64 3, !dbg !45
|
| 184 |
+
%176 = icmp slt i32 %163, %175, !dbg !45
|
| 185 |
+
%177 = and i1 %176, %174, !dbg !46
|
| 186 |
+
%178 = or i1 %172, %177, !dbg !47
|
| 187 |
+
%179 = select i1 %178, float %162, float %164, !dbg !48
|
| 188 |
+
%180 = select i1 %178, i32 %163, i32 %175, !dbg !49
|
| 189 |
+
%181 = extractelement <4 x float> %130, i64 2, !dbg !38
|
| 190 |
+
%182 = fcmp ogt float %179, %181, !dbg !38
|
| 191 |
+
%183 = fcmp oeq float %179, %181, !dbg !50
|
| 192 |
+
%184 = fcmp uno float %179, 0.000000e+00, !dbg !40
|
| 193 |
+
%185 = extractelement <4 x i1> %168, i64 2, !dbg !51
|
| 194 |
+
%186 = xor i1 %185, true, !dbg !42
|
| 195 |
+
%187 = and i1 %184, %186, !dbg !43
|
| 196 |
+
%188 = or i1 %182, %187, !dbg !44
|
| 197 |
+
%189 = and i1 %185, %184, !dbg !51
|
| 198 |
+
%190 = or i1 %183, %189, !dbg !52
|
| 199 |
+
%191 = extractelement <4 x i32> %133, i64 2, !dbg !45
|
| 200 |
+
%192 = icmp slt i32 %180, %191, !dbg !45
|
| 201 |
+
%193 = and i1 %192, %190, !dbg !46
|
| 202 |
+
%194 = or i1 %188, %193, !dbg !47
|
| 203 |
+
%195 = select i1 %194, float %179, float %181, !dbg !48
|
| 204 |
+
%196 = select i1 %194, i32 %180, i32 %191, !dbg !49
|
| 205 |
+
%197 = shufflevector <2 x float> %127, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !38
|
| 206 |
+
%198 = fcmp ogt <2 x float> %127, %197, !dbg !38
|
| 207 |
+
%199 = fcmp oeq <2 x float> %127, %197, !dbg !38
|
| 208 |
+
%200 = shufflevector <2 x i1> %198, <2 x i1> %199, <2 x i32> <i32 0, i32 3>, !dbg !38
|
| 209 |
+
%201 = extractelement <2 x float> %127, i64 0, !dbg !40
|
| 210 |
+
%202 = fcmp uno float %201, 0.000000e+00, !dbg !40
|
| 211 |
+
%203 = extractelement <2 x float> %127, i64 1, !dbg !41
|
| 212 |
+
%204 = fcmp uno float %203, 0.000000e+00, !dbg !41
|
| 213 |
+
%205 = xor i1 %204, true, !dbg !42
|
| 214 |
+
%206 = insertelement <2 x i1> poison, i1 %202, i64 0, !dbg !43
|
| 215 |
+
%207 = shufflevector <2 x i1> %206, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43
|
| 216 |
+
%208 = insertelement <2 x i1> poison, i1 %205, i64 0, !dbg !43
|
| 217 |
+
%209 = insertelement <2 x i1> %208, i1 %204, i64 1, !dbg !43
|
| 218 |
+
%210 = and <2 x i1> %207, %209, !dbg !43
|
| 219 |
+
%211 = or <2 x i1> %200, %210, !dbg !44
|
| 220 |
+
%212 = extractelement <2 x i32> %132, i64 0, !dbg !45
|
| 221 |
+
%213 = extractelement <2 x i32> %132, i64 1, !dbg !45
|
| 222 |
+
%214 = icmp slt i32 %212, %213, !dbg !45
|
| 223 |
+
%215 = extractelement <2 x i1> %211, i64 1, !dbg !46
|
| 224 |
+
%216 = and i1 %214, %215, !dbg !46
|
| 225 |
+
%217 = extractelement <2 x i1> %211, i64 0, !dbg !47
|
| 226 |
+
%218 = or i1 %217, %216, !dbg !47
|
| 227 |
+
%219 = select i1 %218, float %201, float %203, !dbg !48
|
| 228 |
+
%220 = select i1 %218, i32 %212, i32 %213, !dbg !49
|
| 229 |
+
%221 = extractelement <4 x float> %130, i64 1, !dbg !38
|
| 230 |
+
%222 = fcmp ogt float %219, %221, !dbg !38
|
| 231 |
+
%223 = fcmp oeq float %219, %221, !dbg !50
|
| 232 |
+
%224 = fcmp uno float %219, 0.000000e+00, !dbg !40
|
| 233 |
+
%225 = extractelement <4 x i1> %168, i64 1, !dbg !51
|
| 234 |
+
%226 = xor i1 %225, true, !dbg !42
|
| 235 |
+
%227 = and i1 %224, %226, !dbg !43
|
| 236 |
+
%228 = or i1 %222, %227, !dbg !44
|
| 237 |
+
%229 = and i1 %225, %224, !dbg !51
|
| 238 |
+
%230 = or i1 %223, %229, !dbg !52
|
| 239 |
+
%231 = extractelement <4 x i32> %133, i64 1, !dbg !45
|
| 240 |
+
%232 = icmp slt i32 %220, %231, !dbg !45
|
| 241 |
+
%233 = and i1 %232, %230, !dbg !46
|
| 242 |
+
%234 = or i1 %228, %233, !dbg !47
|
| 243 |
+
%235 = select i1 %234, float %219, float %221, !dbg !48
|
| 244 |
+
%236 = select i1 %234, i32 %220, i32 %231, !dbg !49
|
| 245 |
+
%237 = extractelement <4 x float> %130, i64 0, !dbg !38
|
| 246 |
+
%238 = fcmp ogt float %235, %237, !dbg !38
|
| 247 |
+
%239 = fcmp oeq float %235, %237, !dbg !50
|
| 248 |
+
%240 = fcmp uno float %235, 0.000000e+00, !dbg !40
|
| 249 |
+
%241 = extractelement <4 x i1> %168, i64 0, !dbg !51
|
| 250 |
+
%242 = xor i1 %241, true, !dbg !42
|
| 251 |
+
%243 = and i1 %240, %242, !dbg !43
|
| 252 |
+
%244 = or i1 %238, %243, !dbg !44
|
| 253 |
+
%245 = and i1 %241, %240, !dbg !51
|
| 254 |
+
%246 = or i1 %239, %245, !dbg !52
|
| 255 |
+
%247 = extractelement <4 x i32> %133, i64 0, !dbg !45
|
| 256 |
+
%248 = icmp slt i32 %236, %247, !dbg !45
|
| 257 |
+
%249 = and i1 %248, %246, !dbg !46
|
| 258 |
+
%250 = or i1 %244, %249, !dbg !47
|
| 259 |
+
%251 = select i1 %250, float %235, float %237, !dbg !48
|
| 260 |
+
%252 = select i1 %250, i32 %236, i32 %247, !dbg !49
|
| 261 |
+
%253 = bitcast float %195 to i32, !dbg !53
|
| 262 |
+
%254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 16, i32 31), !dbg !53
|
| 263 |
+
%255 = bitcast i32 %254 to float, !dbg !53
|
| 264 |
+
%256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 16, i32 31), !dbg !53
|
| 265 |
+
%257 = fcmp ogt float %195, %255, !dbg !38
|
| 266 |
+
%258 = fcmp oeq float %195, %255, !dbg !50
|
| 267 |
+
%259 = fcmp uno float %195, 0.000000e+00, !dbg !40
|
| 268 |
+
%260 = fcmp uno float %255, 0.000000e+00, !dbg !41
|
| 269 |
+
%261 = xor i1 %260, true, !dbg !42
|
| 270 |
+
%262 = and i1 %259, %261, !dbg !43
|
| 271 |
+
%263 = or i1 %257, %262, !dbg !44
|
| 272 |
+
%264 = and i1 %259, %260, !dbg !51
|
| 273 |
+
%265 = or i1 %258, %264, !dbg !52
|
| 274 |
+
%266 = icmp slt i32 %196, %256, !dbg !45
|
| 275 |
+
%267 = and i1 %266, %265, !dbg !46
|
| 276 |
+
%268 = or i1 %263, %267, !dbg !47
|
| 277 |
+
%269 = select i1 %268, float %195, float %255, !dbg !48
|
| 278 |
+
%270 = select i1 %268, i32 %196, i32 %256, !dbg !49
|
| 279 |
+
%271 = bitcast float %269 to i32, !dbg !53
|
| 280 |
+
%272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 8, i32 31), !dbg !53
|
| 281 |
+
%273 = bitcast i32 %272 to float, !dbg !53
|
| 282 |
+
%274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %270, i32 8, i32 31), !dbg !53
|
| 283 |
+
%275 = fcmp ogt float %269, %273, !dbg !38
|
| 284 |
+
%276 = fcmp oeq float %269, %273, !dbg !50
|
| 285 |
+
%277 = fcmp uno float %269, 0.000000e+00, !dbg !40
|
| 286 |
+
%278 = fcmp uno float %273, 0.000000e+00, !dbg !41
|
| 287 |
+
%279 = xor i1 %278, true, !dbg !42
|
| 288 |
+
%280 = and i1 %277, %279, !dbg !43
|
| 289 |
+
%281 = or i1 %275, %280, !dbg !44
|
| 290 |
+
%282 = and i1 %278, %277, !dbg !51
|
| 291 |
+
%283 = or i1 %276, %282, !dbg !52
|
| 292 |
+
%284 = icmp slt i32 %270, %274, !dbg !45
|
| 293 |
+
%285 = and i1 %284, %283, !dbg !46
|
| 294 |
+
%286 = or i1 %281, %285, !dbg !47
|
| 295 |
+
%287 = select i1 %286, float %269, float %273, !dbg !48
|
| 296 |
+
%288 = select i1 %286, i32 %270, i32 %274, !dbg !49
|
| 297 |
+
%289 = bitcast float %287 to i32, !dbg !53
|
| 298 |
+
%290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 4, i32 31), !dbg !53
|
| 299 |
+
%291 = bitcast i32 %290 to float, !dbg !53
|
| 300 |
+
%292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 4, i32 31), !dbg !53
|
| 301 |
+
%293 = fcmp ogt float %287, %291, !dbg !38
|
| 302 |
+
%294 = fcmp oeq float %287, %291, !dbg !50
|
| 303 |
+
%295 = fcmp uno float %287, 0.000000e+00, !dbg !40
|
| 304 |
+
%296 = fcmp uno float %291, 0.000000e+00, !dbg !41
|
| 305 |
+
%297 = xor i1 %296, true, !dbg !42
|
| 306 |
+
%298 = and i1 %295, %297, !dbg !43
|
| 307 |
+
%299 = or i1 %293, %298, !dbg !44
|
| 308 |
+
%300 = and i1 %296, %295, !dbg !51
|
| 309 |
+
%301 = or i1 %294, %300, !dbg !52
|
| 310 |
+
%302 = icmp slt i32 %288, %292, !dbg !45
|
| 311 |
+
%303 = and i1 %302, %301, !dbg !46
|
| 312 |
+
%304 = or i1 %299, %303, !dbg !47
|
| 313 |
+
%305 = select i1 %304, float %287, float %291, !dbg !48
|
| 314 |
+
%306 = select i1 %304, i32 %288, i32 %292, !dbg !49
|
| 315 |
+
%307 = bitcast float %305 to i32, !dbg !53
|
| 316 |
+
%308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 2, i32 31), !dbg !53
|
| 317 |
+
%309 = bitcast i32 %308 to float, !dbg !53
|
| 318 |
+
%310 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !53
|
| 319 |
+
%311 = fcmp ogt float %305, %309, !dbg !38
|
| 320 |
+
%312 = fcmp oeq float %305, %309, !dbg !50
|
| 321 |
+
%313 = fcmp uno float %305, 0.000000e+00, !dbg !40
|
| 322 |
+
%314 = fcmp uno float %309, 0.000000e+00, !dbg !41
|
| 323 |
+
%315 = xor i1 %314, true, !dbg !42
|
| 324 |
+
%316 = and i1 %313, %315, !dbg !43
|
| 325 |
+
%317 = or i1 %311, %316, !dbg !44
|
| 326 |
+
%318 = and i1 %314, %313, !dbg !51
|
| 327 |
+
%319 = or i1 %312, %318, !dbg !52
|
| 328 |
+
%320 = icmp slt i32 %306, %310, !dbg !45
|
| 329 |
+
%321 = and i1 %320, %319, !dbg !46
|
| 330 |
+
%322 = or i1 %317, %321, !dbg !47
|
| 331 |
+
%323 = select i1 %322, float %305, float %309, !dbg !48
|
| 332 |
+
%324 = select i1 %322, i32 %306, i32 %310, !dbg !49
|
| 333 |
+
%325 = bitcast float %323 to i32, !dbg !53
|
| 334 |
+
%326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 1, i32 31), !dbg !53
|
| 335 |
+
%327 = bitcast i32 %326 to float, !dbg !53
|
| 336 |
+
%328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 1, i32 31), !dbg !53
|
| 337 |
+
%329 = fcmp ogt float %323, %327, !dbg !38
|
| 338 |
+
%330 = fcmp oeq float %323, %327, !dbg !50
|
| 339 |
+
%331 = fcmp uno float %323, 0.000000e+00, !dbg !40
|
| 340 |
+
%332 = fcmp uno float %327, 0.000000e+00, !dbg !41
|
| 341 |
+
%333 = xor i1 %332, true, !dbg !42
|
| 342 |
+
%334 = and i1 %331, %333, !dbg !43
|
| 343 |
+
%335 = or i1 %329, %334, !dbg !44
|
| 344 |
+
%336 = and i1 %332, %331, !dbg !51
|
| 345 |
+
%337 = or i1 %330, %336, !dbg !52
|
| 346 |
+
%338 = icmp slt i32 %324, %328, !dbg !45
|
| 347 |
+
%339 = and i1 %338, %337, !dbg !46
|
| 348 |
+
%340 = or i1 %335, %339, !dbg !47
|
| 349 |
+
%341 = select i1 %340, i32 %324, i32 %328, !dbg !49
|
| 350 |
+
%342 = bitcast float %251 to i32, !dbg !53
|
| 351 |
+
%343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 16, i32 31), !dbg !53
|
| 352 |
+
%344 = bitcast i32 %343 to float, !dbg !53
|
| 353 |
+
%345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 16, i32 31), !dbg !53
|
| 354 |
+
%346 = fcmp ogt float %251, %344, !dbg !38
|
| 355 |
+
%347 = fcmp oeq float %251, %344, !dbg !50
|
| 356 |
+
%348 = fcmp uno float %251, 0.000000e+00, !dbg !40
|
| 357 |
+
%349 = fcmp uno float %344, 0.000000e+00, !dbg !41
|
| 358 |
+
%350 = xor i1 %349, true, !dbg !42
|
| 359 |
+
%351 = and i1 %348, %350, !dbg !43
|
| 360 |
+
%352 = or i1 %346, %351, !dbg !44
|
| 361 |
+
%353 = and i1 %348, %349, !dbg !51
|
| 362 |
+
%354 = or i1 %347, %353, !dbg !52
|
| 363 |
+
%355 = icmp slt i32 %252, %345, !dbg !45
|
| 364 |
+
%356 = and i1 %355, %354, !dbg !46
|
| 365 |
+
%357 = or i1 %352, %356, !dbg !47
|
| 366 |
+
%358 = select i1 %357, float %251, float %344, !dbg !48
|
| 367 |
+
%359 = select i1 %357, i32 %252, i32 %345, !dbg !49
|
| 368 |
+
%360 = bitcast float %358 to i32, !dbg !53
|
| 369 |
+
%361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 8, i32 31), !dbg !53
|
| 370 |
+
%362 = bitcast i32 %361 to float, !dbg !53
|
| 371 |
+
%363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 8, i32 31), !dbg !53
|
| 372 |
+
%364 = fcmp ogt float %358, %362, !dbg !38
|
| 373 |
+
%365 = fcmp oeq float %358, %362, !dbg !50
|
| 374 |
+
%366 = fcmp uno float %358, 0.000000e+00, !dbg !40
|
| 375 |
+
%367 = fcmp uno float %362, 0.000000e+00, !dbg !41
|
| 376 |
+
%368 = xor i1 %367, true, !dbg !42
|
| 377 |
+
%369 = and i1 %366, %368, !dbg !43
|
| 378 |
+
%370 = or i1 %364, %369, !dbg !44
|
| 379 |
+
%371 = and i1 %367, %366, !dbg !51
|
| 380 |
+
%372 = or i1 %365, %371, !dbg !52
|
| 381 |
+
%373 = icmp slt i32 %359, %363, !dbg !45
|
| 382 |
+
%374 = and i1 %373, %372, !dbg !46
|
| 383 |
+
%375 = or i1 %370, %374, !dbg !47
|
| 384 |
+
%376 = select i1 %375, float %358, float %362, !dbg !48
|
| 385 |
+
%377 = select i1 %375, i32 %359, i32 %363, !dbg !49
|
| 386 |
+
%378 = bitcast float %376 to i32, !dbg !53
|
| 387 |
+
%379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 4, i32 31), !dbg !53
|
| 388 |
+
%380 = bitcast i32 %379 to float, !dbg !53
|
| 389 |
+
%381 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %377, i32 4, i32 31), !dbg !53
|
| 390 |
+
%382 = fcmp ogt float %376, %380, !dbg !38
|
| 391 |
+
%383 = fcmp oeq float %376, %380, !dbg !50
|
| 392 |
+
%384 = fcmp uno float %376, 0.000000e+00, !dbg !40
|
| 393 |
+
%385 = fcmp uno float %380, 0.000000e+00, !dbg !41
|
| 394 |
+
%386 = xor i1 %385, true, !dbg !42
|
| 395 |
+
%387 = and i1 %384, %386, !dbg !43
|
| 396 |
+
%388 = or i1 %382, %387, !dbg !44
|
| 397 |
+
%389 = and i1 %385, %384, !dbg !51
|
| 398 |
+
%390 = or i1 %383, %389, !dbg !52
|
| 399 |
+
%391 = icmp slt i32 %377, %381, !dbg !45
|
| 400 |
+
%392 = and i1 %391, %390, !dbg !46
|
| 401 |
+
%393 = or i1 %388, %392, !dbg !47
|
| 402 |
+
%394 = select i1 %393, float %376, float %380, !dbg !48
|
| 403 |
+
%395 = select i1 %393, i32 %377, i32 %381, !dbg !49
|
| 404 |
+
%396 = bitcast float %394 to i32, !dbg !53
|
| 405 |
+
%397 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %396, i32 2, i32 31), !dbg !53
|
| 406 |
+
%398 = bitcast i32 %397 to float, !dbg !53
|
| 407 |
+
%399 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %395, i32 2, i32 31), !dbg !53
|
| 408 |
+
%400 = fcmp ogt float %394, %398, !dbg !38
|
| 409 |
+
%401 = fcmp oeq float %394, %398, !dbg !50
|
| 410 |
+
%402 = fcmp uno float %394, 0.000000e+00, !dbg !40
|
| 411 |
+
%403 = fcmp uno float %398, 0.000000e+00, !dbg !41
|
| 412 |
+
%404 = xor i1 %403, true, !dbg !42
|
| 413 |
+
%405 = and i1 %402, %404, !dbg !43
|
| 414 |
+
%406 = or i1 %400, %405, !dbg !44
|
| 415 |
+
%407 = and i1 %403, %402, !dbg !51
|
| 416 |
+
%408 = or i1 %401, %407, !dbg !52
|
| 417 |
+
%409 = icmp slt i32 %395, %399, !dbg !45
|
| 418 |
+
%410 = and i1 %409, %408, !dbg !46
|
| 419 |
+
%411 = or i1 %406, %410, !dbg !47
|
| 420 |
+
%412 = select i1 %411, float %394, float %398, !dbg !48
|
| 421 |
+
%413 = select i1 %411, i32 %395, i32 %399, !dbg !49
|
| 422 |
+
%414 = bitcast float %412 to i32, !dbg !53
|
| 423 |
+
%415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !53
|
| 424 |
+
%416 = bitcast i32 %415 to float, !dbg !53
|
| 425 |
+
%417 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %413, i32 1, i32 31), !dbg !53
|
| 426 |
+
%418 = fcmp ogt float %412, %416, !dbg !38
|
| 427 |
+
%419 = fcmp oeq float %412, %416, !dbg !50
|
| 428 |
+
%420 = fcmp uno float %412, 0.000000e+00, !dbg !40
|
| 429 |
+
%421 = fcmp uno float %416, 0.000000e+00, !dbg !41
|
| 430 |
+
%422 = xor i1 %421, true, !dbg !42
|
| 431 |
+
%423 = and i1 %420, %422, !dbg !43
|
| 432 |
+
%424 = or i1 %418, %423, !dbg !44
|
| 433 |
+
%425 = and i1 %421, %420, !dbg !51
|
| 434 |
+
%426 = or i1 %419, %425, !dbg !52
|
| 435 |
+
%427 = icmp slt i32 %413, %417, !dbg !45
|
| 436 |
+
%428 = and i1 %427, %426, !dbg !46
|
| 437 |
+
%429 = or i1 %424, %428, !dbg !47
|
| 438 |
+
%430 = select i1 %429, i32 %413, i32 %417, !dbg !49
|
| 439 |
+
%431 = and i32 %139, 3, !dbg !53
|
| 440 |
+
%432 = icmp eq i32 %138, 0, !dbg !53
|
| 441 |
+
%433 = lshr exact i32 %10, 5, !dbg !53
|
| 442 |
+
%434 = or disjoint i32 %433, %431, !dbg !53
|
| 443 |
+
%435 = getelementptr float, ptr addrspace(3) @global_smem, i32 %434, !dbg !53
|
| 444 |
+
%436 = select i1 %340, i32 %325, i32 %326, !dbg !48
|
| 445 |
+
%437 = insertelement <1 x i32> poison, i32 %436, i64 0, !dbg !53
|
| 446 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %435, <1 x i32> %437, i1 %432) #4, !dbg !53
|
| 447 |
+
%438 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %434, !dbg !53
|
| 448 |
+
%439 = insertelement <1 x i32> poison, i32 %341, i64 0, !dbg !53
|
| 449 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %438, <1 x i32> %439, i1 %432) #4, !dbg !53
|
| 450 |
+
%440 = shl nuw nsw i32 %12, 2, !dbg !53
|
| 451 |
+
%441 = or disjoint i32 %440, %431, !dbg !53
|
| 452 |
+
%442 = getelementptr float, ptr addrspace(3) @global_smem, i32 %441, !dbg !53
|
| 453 |
+
%443 = select i1 %429, i32 %414, i32 %415, !dbg !48
|
| 454 |
+
%444 = insertelement <1 x i32> poison, i32 %443, i64 0, !dbg !53
|
| 455 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %442, <1 x i32> %444, i1 %432) #4, !dbg !53
|
| 456 |
+
%445 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %441, !dbg !53
|
| 457 |
+
%446 = insertelement <1 x i32> poison, i32 %430, i64 0, !dbg !53
|
| 458 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %445, <1 x i32> %446, i1 %432) #4, !dbg !53
|
| 459 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
|
| 460 |
+
%447 = icmp samesign ult i32 %9, 32, !dbg !53
|
| 461 |
+
%448 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !53
|
| 462 |
+
%449 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %448, i1 %447) #4, !dbg !53
|
| 463 |
+
%450 = bitcast i32 %449 to float, !dbg !53
|
| 464 |
+
%451 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %9, !dbg !53
|
| 465 |
+
%452 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %451, i1 %447) #4, !dbg !53
|
| 466 |
+
%453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 2, i32 31), !dbg !53
|
| 467 |
+
%454 = bitcast i32 %453 to float, !dbg !53
|
| 468 |
+
%455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 2, i32 31), !dbg !53
|
| 469 |
+
%456 = fcmp ogt float %450, %454, !dbg !38
|
| 470 |
+
%457 = fcmp oeq float %450, %454, !dbg !50
|
| 471 |
+
%458 = fcmp uno float %450, 0.000000e+00, !dbg !40
|
| 472 |
+
%459 = fcmp uno float %454, 0.000000e+00, !dbg !41
|
| 473 |
+
%460 = xor i1 %459, true, !dbg !42
|
| 474 |
+
%461 = and i1 %458, %460, !dbg !43
|
| 475 |
+
%462 = or i1 %456, %461, !dbg !44
|
| 476 |
+
%463 = and i1 %458, %459, !dbg !51
|
| 477 |
+
%464 = or i1 %457, %463, !dbg !52
|
| 478 |
+
%465 = icmp slt i32 %452, %455, !dbg !45
|
| 479 |
+
%466 = and i1 %465, %464, !dbg !46
|
| 480 |
+
%467 = or i1 %462, %466, !dbg !47
|
| 481 |
+
%468 = select i1 %467, float %450, float %454, !dbg !48
|
| 482 |
+
%469 = select i1 %467, i32 %452, i32 %455, !dbg !49
|
| 483 |
+
%470 = bitcast float %468 to i32, !dbg !53
|
| 484 |
+
%471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 1, i32 31), !dbg !53
|
| 485 |
+
%472 = bitcast i32 %471 to float, !dbg !53
|
| 486 |
+
%473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !53
|
| 487 |
+
%474 = fcmp ogt float %468, %472, !dbg !38
|
| 488 |
+
%475 = fcmp oeq float %468, %472, !dbg !50
|
| 489 |
+
%476 = fcmp uno float %468, 0.000000e+00, !dbg !40
|
| 490 |
+
%477 = fcmp uno float %472, 0.000000e+00, !dbg !41
|
| 491 |
+
%478 = xor i1 %477, true, !dbg !42
|
| 492 |
+
%479 = and i1 %476, %478, !dbg !43
|
| 493 |
+
%480 = or i1 %474, %479, !dbg !44
|
| 494 |
+
%481 = and i1 %477, %476, !dbg !51
|
| 495 |
+
%482 = or i1 %475, %481, !dbg !52
|
| 496 |
+
%483 = icmp slt i32 %469, %473, !dbg !45
|
| 497 |
+
%484 = and i1 %483, %482, !dbg !46
|
| 498 |
+
%485 = or i1 %480, %484, !dbg !47
|
| 499 |
+
%486 = select i1 %485, i32 %469, i32 %473, !dbg !49
|
| 500 |
+
%487 = and i32 %9, 995, !dbg !53
|
| 501 |
+
%488 = icmp eq i32 %487, 0, !dbg !53
|
| 502 |
+
%489 = select i1 %485, i32 %470, i32 %471, !dbg !48
|
| 503 |
+
%490 = insertelement <1 x i32> poison, i32 %489, i64 0, !dbg !53
|
| 504 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %448, <1 x i32> %490, i1 %488) #4, !dbg !53
|
| 505 |
+
%491 = insertelement <1 x i32> poison, i32 %486, i64 0, !dbg !53
|
| 506 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %451, <1 x i32> %491, i1 %488) #4, !dbg !53
|
| 507 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
|
| 508 |
+
%492 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %433, !dbg !53
|
| 509 |
+
%493 = load i32, ptr addrspace(3) %492, align 16, !dbg !53
|
| 510 |
+
%494 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %440, !dbg !53
|
| 511 |
+
%495 = load i32, ptr addrspace(3) %494, align 16, !dbg !53
|
| 512 |
+
%496 = sext i32 %137 to i64, !dbg !54
|
| 513 |
+
%497 = getelementptr i64, ptr addrspace(1) %1, i64 %496, !dbg !54
|
| 514 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55
|
| 515 |
+
%498 = lshr exact i32 %10, 4, !dbg !55
|
| 516 |
+
%499 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %498, !dbg !55
|
| 517 |
+
%500 = insertelement <2 x i32> poison, i32 %493, i64 0, !dbg !55
|
| 518 |
+
%501 = insertelement <2 x i32> %500, i32 %495, i64 1, !dbg !55
|
| 519 |
+
store <2 x i32> %501, ptr addrspace(3) %499, align 8, !dbg !55
|
| 520 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55
|
| 521 |
+
%502 = shl nuw nsw i32 %9, 3, !dbg !55
|
| 522 |
+
%503 = and i32 %502, 24, !dbg !55
|
| 523 |
+
%504 = and i32 %9, 4, !dbg !55
|
| 524 |
+
%505 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %503, !dbg !55
|
| 525 |
+
%506 = getelementptr inbounds nuw i8, ptr addrspace(3) %505, i32 %504, !dbg !55
|
| 526 |
+
%507 = load i32, ptr addrspace(3) %506, align 4, !dbg !55
|
| 527 |
+
%508 = sext i32 %507 to i64, !dbg !55
|
| 528 |
+
%509 = and i32 %9, 504, !dbg !55
|
| 529 |
+
%510 = icmp eq i32 %509, 0, !dbg !55
|
| 530 |
+
tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %508, ptr addrspace(1) %497, i1 %510) #4, !dbg !55
|
| 531 |
+
ret void, !dbg !56
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 535 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 536 |
+
|
| 537 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 538 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 539 |
+
|
| 540 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 541 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
| 542 |
+
|
| 543 |
+
; Function Attrs: convergent nocallback nounwind
|
| 544 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
|
| 545 |
+
|
| 546 |
+
attributes #0 = { nounwind "nvvm.reqntid"="512" }
|
| 547 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 548 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 549 |
+
attributes #3 = { convergent nocallback nounwind }
|
| 550 |
+
attributes #4 = { nounwind }
|
| 551 |
+
|
| 552 |
+
!llvm.dbg.cu = !{!0}
|
| 553 |
+
!llvm.module.flags = !{!2, !3}
|
| 554 |
+
|
| 555 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 556 |
+
!1 = !DIFile(filename: "cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv")
|
| 557 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 558 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 559 |
+
!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 560 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 561 |
+
!6 = !{}
|
| 562 |
+
!7 = !DILocation(line: 23, column: 28, scope: !4)
|
| 563 |
+
!8 = !DILocation(line: 23, column: 33, scope: !4)
|
| 564 |
+
!9 = !DILocation(line: 24, column: 44, scope: !4)
|
| 565 |
+
!10 = !DILocation(line: 24, column: 23, scope: !4)
|
| 566 |
+
!11 = !DILocation(line: 26, column: 37, scope: !4)
|
| 567 |
+
!12 = !DILocation(line: 29, column: 19, scope: !4)
|
| 568 |
+
!13 = !DILocation(line: 33, column: 40, scope: !4)
|
| 569 |
+
!14 = !DILocation(line: 34, column: 31, scope: !4)
|
| 570 |
+
!15 = !DILocation(line: 35, column: 29, scope: !4)
|
| 571 |
+
!16 = !DILocation(line: 39, column: 52, scope: !4)
|
| 572 |
+
!17 = !DILocation(line: 39, column: 34, scope: !4)
|
| 573 |
+
!18 = !DILocation(line: 39, column: 66, scope: !4)
|
| 574 |
+
!19 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !22)
|
| 575 |
+
!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0)
|
| 576 |
+
!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
|
| 577 |
+
!22 = !DILocation(line: 42, column: 38, scope: !4)
|
| 578 |
+
!23 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !22)
|
| 579 |
+
!24 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !22)
|
| 580 |
+
!25 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !22)
|
| 581 |
+
!26 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !22)
|
| 582 |
+
!27 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !22)
|
| 583 |
+
!28 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !22)
|
| 584 |
+
!29 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !22)
|
| 585 |
+
!30 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !22)
|
| 586 |
+
!31 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !22)
|
| 587 |
+
!32 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !22)
|
| 588 |
+
!33 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !22)
|
| 589 |
+
!34 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !22)
|
| 590 |
+
!35 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !22)
|
| 591 |
+
!36 = !DILocation(line: 44, column: 46, scope: !4)
|
| 592 |
+
!37 = !DILocation(line: 45, column: 58, scope: !4)
|
| 593 |
+
!38 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !39)
|
| 594 |
+
!39 = !DILocation(line: 46, column: 75, scope: !4)
|
| 595 |
+
!40 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !39)
|
| 596 |
+
!41 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !39)
|
| 597 |
+
!42 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !39)
|
| 598 |
+
!43 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !39)
|
| 599 |
+
!44 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !39)
|
| 600 |
+
!45 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !39)
|
| 601 |
+
!46 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !39)
|
| 602 |
+
!47 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !39)
|
| 603 |
+
!48 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !39)
|
| 604 |
+
!49 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !39)
|
| 605 |
+
!50 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !39)
|
| 606 |
+
!51 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !39)
|
| 607 |
+
!52 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !39)
|
| 608 |
+
!53 = !DILocation(line: 165, column: 42, scope: !20, inlinedAt: !39)
|
| 609 |
+
!54 = !DILocation(line: 48, column: 25, scope: !4)
|
| 610 |
+
!55 = !DILocation(line: 48, column: 36, scope: !4)
|
| 611 |
+
!56 = !DILocation(line: 48, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx
ADDED
|
@@ -0,0 +1,1196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
// @triton_red_fused_argmax_1
|
| 12 |
+
.visible .entry triton_red_fused_argmax_1(
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1,
|
| 15 |
+
.param .u32 triton_red_fused_argmax_1_param_2,
|
| 16 |
+
.param .u32 triton_red_fused_argmax_1_param_3,
|
| 17 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4,
|
| 18 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5
|
| 19 |
+
)
|
| 20 |
+
.reqntid 512
|
| 21 |
+
{
|
| 22 |
+
.reg .pred %p<325>;
|
| 23 |
+
.reg .b32 %r<226>;
|
| 24 |
+
.reg .b64 %rd<63>;
|
| 25 |
+
.loc 1 18 0 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:18:0
|
| 26 |
+
$L__func_begin0:
|
| 27 |
+
.loc 1 18 0 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:18:0
|
| 28 |
+
|
| 29 |
+
// %bb.0:
|
| 30 |
+
ld.param.b64 %rd15, [triton_red_fused_argmax_1_param_1];
|
| 31 |
+
ld.param.b64 %rd14, [triton_red_fused_argmax_1_param_0];
|
| 32 |
+
$L__tmp0:
|
| 33 |
+
.loc 1 23 28 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:23:28
|
| 34 |
+
mov.u32 %r27, %ctaid.x;
|
| 35 |
+
.loc 1 23 33 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:23:33
|
| 36 |
+
shl.b32 %r1, %r27, 3;
|
| 37 |
+
.loc 1 24 44 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:44
|
| 38 |
+
mov.u32 %r2, %tid.x;
|
| 39 |
+
and.b32 %r3, %r2, 384;
|
| 40 |
+
bfe.u32 %r28, %r2, 7, 2;
|
| 41 |
+
or.b32 %r4, %r28, 4;
|
| 42 |
+
.loc 1 24 23 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:23
|
| 43 |
+
or.b32 %r29, %r28, %r1;
|
| 44 |
+
or.b32 %r30, %r4, %r1;
|
| 45 |
+
.loc 1 26 37 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:26:37
|
| 46 |
+
shl.b32 %r31, %r2, 2;
|
| 47 |
+
and.b32 %r32, %r31, 508;
|
| 48 |
+
.loc 1 29 19 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:29:19
|
| 49 |
+
bfe.s32 %r33, %r27, 28, 1;
|
| 50 |
+
shr.u32 %r34, %r33, 21;
|
| 51 |
+
add.s32 %r35, %r29, %r34;
|
| 52 |
+
shr.s32 %r36, %r35, 11;
|
| 53 |
+
add.s32 %r37, %r30, %r34;
|
| 54 |
+
shr.s32 %r38, %r37, 11;
|
| 55 |
+
.loc 1 33 40 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:33:40
|
| 56 |
+
cvt.u64.u32 %rd1, %r32;
|
| 57 |
+
mul.lo.s32 %r39, %r27, 256000;
|
| 58 |
+
mad.lo.s32 %r40, %r38, 224000, %r39;
|
| 59 |
+
mul.lo.s32 %r41, %r28, 32000;
|
| 60 |
+
add.s32 %r42, %r40, %r41;
|
| 61 |
+
add.s32 %r43, %r42, %r32;
|
| 62 |
+
cvt.u64.u32 %rd2, %r43;
|
| 63 |
+
mad.lo.s32 %r44, %r36, 224000, %r39;
|
| 64 |
+
add.s32 %r45, %r44, %r41;
|
| 65 |
+
add.s32 %r46, %r45, %r32;
|
| 66 |
+
cvt.u64.u32 %rd3, %r46;
|
| 67 |
+
mov.b32 %r47, 0fFF800000;
|
| 68 |
+
mov.b64 %rd59, {%r47, %r47};
|
| 69 |
+
mov.b32 %r218, 2147483647;
|
| 70 |
+
mov.b64 %rd58, -512;
|
| 71 |
+
mov.b32 %r219, %r218;
|
| 72 |
+
mov.b64 %rd60, %rd59;
|
| 73 |
+
mov.b32 %r220, %r218;
|
| 74 |
+
mov.b32 %r221, %r218;
|
| 75 |
+
mov.b64 %rd61, %rd59;
|
| 76 |
+
mov.b64 %rd62, %rd59;
|
| 77 |
+
mov.b32 %r222, %r218;
|
| 78 |
+
mov.b32 %r223, %r218;
|
| 79 |
+
mov.b32 %r224, %r218;
|
| 80 |
+
mov.b32 %r225, %r218;
|
| 81 |
+
$L__BB0_1: // =>This Inner Loop Header: Depth=1
|
| 82 |
+
.loc 1 35 29 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:35:29
|
| 83 |
+
add.s64 %rd26, %rd1, %rd58;
|
| 84 |
+
add.s64 %rd27, %rd26, 512;
|
| 85 |
+
setp.lt.u64 %p1, %rd27, 32000;
|
| 86 |
+
.loc 1 39 52 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:52
|
| 87 |
+
add.s64 %rd28, %rd3, %rd58;
|
| 88 |
+
.loc 1 39 34 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:34
|
| 89 |
+
add.s64 %rd29, %rd2, %rd58;
|
| 90 |
+
cvt.u32.u64 %r64, %rd28;
|
| 91 |
+
add.s32 %r65, %r64, 512;
|
| 92 |
+
mad.wide.s32 %rd21, %r65, 4, %rd14;
|
| 93 |
+
cvt.u32.u64 %r66, %rd29;
|
| 94 |
+
add.s32 %r67, %r66, 128512;
|
| 95 |
+
mad.wide.s32 %rd24, %r67, 4, %rd14;
|
| 96 |
+
.loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
|
| 97 |
+
// begin inline asm
|
| 98 |
+
mov.u64 %rd20, 0x0;
|
| 99 |
+
createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0;
|
| 100 |
+
// end inline asm
|
| 101 |
+
mov.b32 %r52, 0;
|
| 102 |
+
// begin inline asm
|
| 103 |
+
mov.u32 %r48, %r52;
|
| 104 |
+
mov.u32 %r49, %r52;
|
| 105 |
+
mov.u32 %r50, %r52;
|
| 106 |
+
mov.u32 %r51, %r52;
|
| 107 |
+
@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd21 + 0 ], %rd20;
|
| 108 |
+
// end inline asm
|
| 109 |
+
// begin inline asm
|
| 110 |
+
mov.u64 %rd23, 0x0;
|
| 111 |
+
createpolicy.fractional.L2::evict_first.b64 %rd23, 1.0;
|
| 112 |
+
// end inline asm
|
| 113 |
+
// begin inline asm
|
| 114 |
+
mov.u32 %r56, %r52;
|
| 115 |
+
mov.u32 %r57, %r52;
|
| 116 |
+
mov.u32 %r58, %r52;
|
| 117 |
+
mov.u32 %r59, %r52;
|
| 118 |
+
@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd24 + 0 ], %rd23;
|
| 119 |
+
// end inline asm
|
| 120 |
+
$L__tmp1:
|
| 121 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 122 |
+
mov.b64 {%r68, %r69}, %rd59;
|
| 123 |
+
setp.nan.f32 %p3, %r68, %r68;
|
| 124 |
+
setp.nan.f32 %p4, %r69, %r69;
|
| 125 |
+
mov.b64 {%r70, %r71}, %rd61;
|
| 126 |
+
setp.nan.f32 %p5, %r70, %r70;
|
| 127 |
+
setp.nan.f32 %p6, %r71, %r71;
|
| 128 |
+
mov.b64 {%r72, %r73}, %rd62;
|
| 129 |
+
setp.nan.f32 %p7, %r72, %r72;
|
| 130 |
+
setp.nan.f32 %p8, %r73, %r73;
|
| 131 |
+
mov.b64 {%r74, %r75}, %rd60;
|
| 132 |
+
setp.nan.f32 %p9, %r74, %r74;
|
| 133 |
+
setp.nan.f32 %p10, %r75, %r75;
|
| 134 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 135 |
+
cvt.s64.s32 %rd30, %r219;
|
| 136 |
+
cvt.s64.s32 %rd31, %r218;
|
| 137 |
+
cvt.s64.s32 %rd32, %r221;
|
| 138 |
+
cvt.s64.s32 %rd33, %r220;
|
| 139 |
+
$L__tmp2:
|
| 140 |
+
.loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
|
| 141 |
+
cvt.u64.u32 %rd34, %r48;
|
| 142 |
+
cvt.u64.u32 %rd35, %r49;
|
| 143 |
+
shl.b64 %rd36, %rd35, 32;
|
| 144 |
+
or.b64 %rd37, %rd34, %rd36;
|
| 145 |
+
$L__tmp3:
|
| 146 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 147 |
+
mov.b64 {%r76, %r77}, %rd37;
|
| 148 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 149 |
+
setp.gt.f32 %p11, %r69, %r77;
|
| 150 |
+
setp.gt.f32 %p12, %r68, %r76;
|
| 151 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 152 |
+
setp.eq.f32 %p13, %r68, %r76;
|
| 153 |
+
setp.eq.f32 %p14, %r69, %r77;
|
| 154 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 155 |
+
setp.nan.f32 %p15, %r77, %r77;
|
| 156 |
+
setp.nan.f32 %p16, %r76, %r76;
|
| 157 |
+
setp.num.f32 %p17, %r76, %r76;
|
| 158 |
+
setp.num.f32 %p18, %r77, %r77;
|
| 159 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 160 |
+
and.pred %p19, %p4, %p18;
|
| 161 |
+
and.pred %p20, %p3, %p17;
|
| 162 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 163 |
+
or.pred %p21, %p12, %p20;
|
| 164 |
+
or.pred %p22, %p11, %p19;
|
| 165 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 166 |
+
and.pred %p23, %p3, %p16;
|
| 167 |
+
and.pred %p24, %p4, %p15;
|
| 168 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 169 |
+
or.pred %p25, %p14, %p24;
|
| 170 |
+
or.pred %p26, %p13, %p23;
|
| 171 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 172 |
+
setp.gt.s64 %p27, %rd27, %rd31;
|
| 173 |
+
setp.ge.s64 %p28, %rd27, %rd30;
|
| 174 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 175 |
+
and.pred %p29, %p27, %p26;
|
| 176 |
+
and.pred %p30, %p28, %p25;
|
| 177 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 178 |
+
or.pred %p31, %p22, %p30;
|
| 179 |
+
or.pred %p32, %p21, %p29;
|
| 180 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 181 |
+
selp.f32 %r78, %r68, %r76, %p32;
|
| 182 |
+
selp.f32 %r79, %r69, %r77, %p31;
|
| 183 |
+
cvt.u32.u64 %r80, %rd27;
|
| 184 |
+
cvt.u32.u64 %r81, %rd26;
|
| 185 |
+
add.s32 %r82, %r81, 513;
|
| 186 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 187 |
+
selp.b32 %r83, %r218, %r80, %p32;
|
| 188 |
+
selp.b32 %r84, %r219, %r82, %p31;
|
| 189 |
+
$L__tmp4:
|
| 190 |
+
.loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
|
| 191 |
+
cvt.u64.u32 %rd38, %r56;
|
| 192 |
+
cvt.u64.u32 %rd39, %r57;
|
| 193 |
+
shl.b64 %rd40, %rd39, 32;
|
| 194 |
+
or.b64 %rd41, %rd38, %rd40;
|
| 195 |
+
$L__tmp5:
|
| 196 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 197 |
+
mov.b64 {%r85, %r86}, %rd41;
|
| 198 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 199 |
+
setp.gt.f32 %p33, %r75, %r86;
|
| 200 |
+
setp.gt.f32 %p34, %r74, %r85;
|
| 201 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 202 |
+
setp.eq.f32 %p35, %r74, %r85;
|
| 203 |
+
setp.eq.f32 %p36, %r75, %r86;
|
| 204 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 205 |
+
setp.nan.f32 %p37, %r86, %r86;
|
| 206 |
+
setp.nan.f32 %p38, %r85, %r85;
|
| 207 |
+
setp.num.f32 %p39, %r85, %r85;
|
| 208 |
+
setp.num.f32 %p40, %r86, %r86;
|
| 209 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 210 |
+
and.pred %p41, %p10, %p40;
|
| 211 |
+
and.pred %p42, %p9, %p39;
|
| 212 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 213 |
+
or.pred %p43, %p34, %p42;
|
| 214 |
+
or.pred %p44, %p33, %p41;
|
| 215 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 216 |
+
and.pred %p45, %p9, %p38;
|
| 217 |
+
and.pred %p46, %p10, %p37;
|
| 218 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 219 |
+
or.pred %p47, %p36, %p46;
|
| 220 |
+
or.pred %p48, %p35, %p45;
|
| 221 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 222 |
+
setp.gt.s64 %p49, %rd27, %rd33;
|
| 223 |
+
setp.ge.s64 %p50, %rd27, %rd32;
|
| 224 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 225 |
+
and.pred %p51, %p49, %p48;
|
| 226 |
+
and.pred %p52, %p50, %p47;
|
| 227 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 228 |
+
or.pred %p53, %p44, %p52;
|
| 229 |
+
or.pred %p54, %p43, %p51;
|
| 230 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 231 |
+
selp.f32 %r87, %r74, %r85, %p54;
|
| 232 |
+
selp.f32 %r88, %r75, %r86, %p53;
|
| 233 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 234 |
+
selp.b32 %r89, %r220, %r80, %p54;
|
| 235 |
+
selp.b32 %r90, %r221, %r82, %p53;
|
| 236 |
+
$L__tmp6:
|
| 237 |
+
.loc 1 34 31 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:34:31
|
| 238 |
+
add.s64 %rd42, %rd26, 515;
|
| 239 |
+
add.s64 %rd43, %rd26, 514;
|
| 240 |
+
.loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66
|
| 241 |
+
cvt.u64.u32 %rd44, %r59;
|
| 242 |
+
cvt.u64.u32 %rd45, %r58;
|
| 243 |
+
shl.b64 %rd46, %rd45, 32;
|
| 244 |
+
or.b64 %rd47, %rd44, %rd46;
|
| 245 |
+
cvt.u64.u32 %rd48, %r51;
|
| 246 |
+
cvt.u64.u32 %rd49, %r50;
|
| 247 |
+
shl.b64 %rd50, %rd49, 32;
|
| 248 |
+
or.b64 %rd51, %rd48, %rd50;
|
| 249 |
+
$L__tmp7:
|
| 250 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 251 |
+
mov.b64 {%r91, %r92}, %rd51;
|
| 252 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 253 |
+
setp.gt.f32 %p55, %r73, %r92;
|
| 254 |
+
setp.gt.f32 %p56, %r72, %r91;
|
| 255 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 256 |
+
mov.b64 {%r93, %r94}, %rd47;
|
| 257 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 258 |
+
setp.gt.f32 %p57, %r71, %r94;
|
| 259 |
+
setp.gt.f32 %p58, %r70, %r93;
|
| 260 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 261 |
+
setp.eq.f32 %p59, %r70, %r93;
|
| 262 |
+
setp.eq.f32 %p60, %r71, %r94;
|
| 263 |
+
setp.eq.f32 %p61, %r72, %r91;
|
| 264 |
+
setp.eq.f32 %p62, %r73, %r92;
|
| 265 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 266 |
+
setp.nan.f32 %p63, %r92, %r92;
|
| 267 |
+
setp.nan.f32 %p64, %r91, %r91;
|
| 268 |
+
setp.nan.f32 %p65, %r94, %r94;
|
| 269 |
+
setp.nan.f32 %p66, %r93, %r93;
|
| 270 |
+
setp.num.f32 %p67, %r93, %r93;
|
| 271 |
+
setp.num.f32 %p68, %r94, %r94;
|
| 272 |
+
setp.num.f32 %p69, %r91, %r91;
|
| 273 |
+
setp.num.f32 %p70, %r92, %r92;
|
| 274 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 275 |
+
and.pred %p71, %p8, %p70;
|
| 276 |
+
and.pred %p72, %p7, %p69;
|
| 277 |
+
and.pred %p73, %p6, %p68;
|
| 278 |
+
and.pred %p74, %p5, %p67;
|
| 279 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 280 |
+
or.pred %p75, %p58, %p74;
|
| 281 |
+
or.pred %p76, %p57, %p73;
|
| 282 |
+
or.pred %p77, %p56, %p72;
|
| 283 |
+
or.pred %p78, %p55, %p71;
|
| 284 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 285 |
+
and.pred %p79, %p5, %p66;
|
| 286 |
+
and.pred %p80, %p6, %p65;
|
| 287 |
+
and.pred %p81, %p7, %p64;
|
| 288 |
+
and.pred %p82, %p8, %p63;
|
| 289 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 290 |
+
or.pred %p83, %p62, %p82;
|
| 291 |
+
or.pred %p84, %p61, %p81;
|
| 292 |
+
or.pred %p85, %p60, %p80;
|
| 293 |
+
or.pred %p86, %p59, %p79;
|
| 294 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 295 |
+
cvt.s64.s32 %rd52, %r222;
|
| 296 |
+
cvt.s64.s32 %rd53, %r223;
|
| 297 |
+
cvt.s64.s32 %rd54, %r224;
|
| 298 |
+
cvt.s64.s32 %rd55, %r225;
|
| 299 |
+
setp.gt.s64 %p87, %rd43, %rd55;
|
| 300 |
+
setp.gt.s64 %p88, %rd42, %rd54;
|
| 301 |
+
setp.gt.s64 %p89, %rd43, %rd53;
|
| 302 |
+
setp.gt.s64 %p90, %rd42, %rd52;
|
| 303 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 304 |
+
and.pred %p91, %p90, %p86;
|
| 305 |
+
and.pred %p92, %p89, %p85;
|
| 306 |
+
and.pred %p93, %p88, %p84;
|
| 307 |
+
and.pred %p94, %p87, %p83;
|
| 308 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 309 |
+
or.pred %p95, %p78, %p94;
|
| 310 |
+
or.pred %p96, %p77, %p93;
|
| 311 |
+
or.pred %p97, %p76, %p92;
|
| 312 |
+
or.pred %p98, %p75, %p91;
|
| 313 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 314 |
+
selp.f32 %r95, %r70, %r93, %p98;
|
| 315 |
+
selp.f32 %r96, %r71, %r94, %p97;
|
| 316 |
+
selp.f32 %r97, %r72, %r91, %p96;
|
| 317 |
+
selp.f32 %r98, %r73, %r92, %p95;
|
| 318 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ]
|
| 319 |
+
cvt.u32.u64 %r99, %rd43;
|
| 320 |
+
cvt.u32.u64 %r100, %rd42;
|
| 321 |
+
selp.b32 %r101, %r222, %r100, %p98;
|
| 322 |
+
selp.b32 %r102, %r223, %r99, %p97;
|
| 323 |
+
selp.b32 %r103, %r224, %r100, %p96;
|
| 324 |
+
selp.b32 %r104, %r225, %r99, %p95;
|
| 325 |
+
$L__tmp8:
|
| 326 |
+
.loc 1 44 46 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:44:46
|
| 327 |
+
selp.f32 %r105, %r79, %r69, %p1;
|
| 328 |
+
selp.f32 %r106, %r78, %r68, %p1;
|
| 329 |
+
mov.b64 %rd59, {%r106, %r105};
|
| 330 |
+
selp.f32 %r107, %r88, %r75, %p1;
|
| 331 |
+
selp.f32 %r108, %r87, %r74, %p1;
|
| 332 |
+
mov.b64 %rd60, {%r108, %r107};
|
| 333 |
+
selp.f32 %r109, %r98, %r73, %p1;
|
| 334 |
+
selp.f32 %r110, %r97, %r72, %p1;
|
| 335 |
+
mov.b64 %rd62, {%r110, %r109};
|
| 336 |
+
selp.f32 %r111, %r96, %r71, %p1;
|
| 337 |
+
selp.f32 %r112, %r95, %r70, %p1;
|
| 338 |
+
mov.b64 %rd61, {%r112, %r111};
|
| 339 |
+
.loc 1 45 58 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:45:58
|
| 340 |
+
selp.b32 %r219, %r84, %r219, %p1;
|
| 341 |
+
selp.b32 %r218, %r83, %r218, %p1;
|
| 342 |
+
selp.b32 %r221, %r90, %r221, %p1;
|
| 343 |
+
selp.b32 %r220, %r89, %r220, %p1;
|
| 344 |
+
selp.b32 %r225, %r104, %r225, %p1;
|
| 345 |
+
selp.b32 %r224, %r103, %r224, %p1;
|
| 346 |
+
selp.b32 %r223, %r102, %r223, %p1;
|
| 347 |
+
selp.b32 %r222, %r101, %r222, %p1;
|
| 348 |
+
.loc 1 33 40 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:33:40
|
| 349 |
+
add.s64 %rd58, %rd58, 512;
|
| 350 |
+
setp.lt.u64 %p99, %rd58, 31488;
|
| 351 |
+
@%p99 bra $L__BB0_1;
|
| 352 |
+
// %bb.2:
|
| 353 |
+
.loc 1 24 44 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:44
|
| 354 |
+
and.b32 %r129, %r2, 7;
|
| 355 |
+
.loc 1 24 23 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:23
|
| 356 |
+
or.b32 %r130, %r1, %r129;
|
| 357 |
+
.loc 1 24 44 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:24:44
|
| 358 |
+
and.b32 %r131, %r2, 31;
|
| 359 |
+
$L__tmp9:
|
| 360 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 361 |
+
mov.b64 {%r132, %r133}, %rd59;
|
| 362 |
+
setp.gt.f32 %p109, %r132, %r133;
|
| 363 |
+
setp.eq.f32 %p110, %r133, %r132;
|
| 364 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 365 |
+
setp.nan.f32 %p111, %r132, %r132;
|
| 366 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 367 |
+
setp.num.f32 %p112, %r133, %r133;
|
| 368 |
+
setp.nan.f32 %p113, %r133, %r133;
|
| 369 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 370 |
+
and.pred %p114, %p111, %p113;
|
| 371 |
+
and.pred %p115, %p111, %p112;
|
| 372 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 373 |
+
or.pred %p116, %p109, %p115;
|
| 374 |
+
or.pred %p117, %p110, %p114;
|
| 375 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 376 |
+
setp.lt.s32 %p118, %r218, %r219;
|
| 377 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 378 |
+
and.pred %p119, %p118, %p117;
|
| 379 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 380 |
+
or.pred %p120, %p116, %p119;
|
| 381 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 382 |
+
selp.f32 %r134, %r132, %r133, %p120;
|
| 383 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 384 |
+
selp.b32 %r135, %r218, %r219, %p120;
|
| 385 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 386 |
+
mov.b64 {%r136, %r137}, %rd62;
|
| 387 |
+
setp.gt.f32 %p121, %r134, %r137;
|
| 388 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 389 |
+
setp.eq.f32 %p122, %r134, %r137;
|
| 390 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 391 |
+
setp.nan.f32 %p123, %r134, %r134;
|
| 392 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 393 |
+
mov.b64 {%r138, %r139}, %rd61;
|
| 394 |
+
setp.nan.f32 %p124, %r138, %r138;
|
| 395 |
+
setp.num.f32 %p125, %r138, %r138;
|
| 396 |
+
setp.nan.f32 %p126, %r139, %r139;
|
| 397 |
+
setp.num.f32 %p127, %r139, %r139;
|
| 398 |
+
setp.nan.f32 %p128, %r136, %r136;
|
| 399 |
+
setp.num.f32 %p129, %r136, %r136;
|
| 400 |
+
setp.nan.f32 %p130, %r137, %r137;
|
| 401 |
+
setp.num.f32 %p131, %r137, %r137;
|
| 402 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 403 |
+
and.pred %p132, %p123, %p131;
|
| 404 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 405 |
+
or.pred %p133, %p121, %p132;
|
| 406 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 407 |
+
and.pred %p134, %p130, %p123;
|
| 408 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 409 |
+
or.pred %p135, %p122, %p134;
|
| 410 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 411 |
+
setp.lt.s32 %p136, %r135, %r225;
|
| 412 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 413 |
+
and.pred %p137, %p136, %p135;
|
| 414 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 415 |
+
or.pred %p138, %p133, %p137;
|
| 416 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 417 |
+
selp.f32 %r140, %r134, %r137, %p138;
|
| 418 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 419 |
+
selp.b32 %r141, %r135, %r225, %p138;
|
| 420 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 421 |
+
setp.gt.f32 %p139, %r140, %r136;
|
| 422 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 423 |
+
setp.eq.f32 %p140, %r140, %r136;
|
| 424 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 425 |
+
setp.nan.f32 %p141, %r140, %r140;
|
| 426 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 427 |
+
and.pred %p142, %p141, %p129;
|
| 428 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 429 |
+
or.pred %p143, %p139, %p142;
|
| 430 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 431 |
+
and.pred %p144, %p128, %p141;
|
| 432 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 433 |
+
or.pred %p145, %p140, %p144;
|
| 434 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 435 |
+
setp.lt.s32 %p146, %r141, %r224;
|
| 436 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 437 |
+
and.pred %p147, %p146, %p145;
|
| 438 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 439 |
+
or.pred %p148, %p143, %p147;
|
| 440 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 441 |
+
selp.f32 %r142, %r140, %r136, %p148;
|
| 442 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 443 |
+
selp.b32 %r143, %r141, %r224, %p148;
|
| 444 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 445 |
+
mov.b64 {%r144, %r145}, %rd60;
|
| 446 |
+
setp.gt.f32 %p149, %r144, %r145;
|
| 447 |
+
setp.eq.f32 %p150, %r145, %r144;
|
| 448 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 449 |
+
setp.nan.f32 %p151, %r144, %r144;
|
| 450 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 451 |
+
setp.num.f32 %p152, %r145, %r145;
|
| 452 |
+
setp.nan.f32 %p153, %r145, %r145;
|
| 453 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 454 |
+
and.pred %p154, %p151, %p153;
|
| 455 |
+
and.pred %p155, %p151, %p152;
|
| 456 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 457 |
+
or.pred %p156, %p149, %p155;
|
| 458 |
+
or.pred %p157, %p150, %p154;
|
| 459 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 460 |
+
setp.lt.s32 %p158, %r220, %r221;
|
| 461 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 462 |
+
and.pred %p159, %p158, %p157;
|
| 463 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 464 |
+
or.pred %p160, %p156, %p159;
|
| 465 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 466 |
+
selp.f32 %r146, %r144, %r145, %p160;
|
| 467 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 468 |
+
selp.b32 %r147, %r220, %r221, %p160;
|
| 469 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 470 |
+
setp.gt.f32 %p161, %r146, %r139;
|
| 471 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 472 |
+
setp.eq.f32 %p162, %r146, %r139;
|
| 473 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 474 |
+
setp.nan.f32 %p163, %r146, %r146;
|
| 475 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 476 |
+
and.pred %p164, %p163, %p127;
|
| 477 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 478 |
+
or.pred %p165, %p161, %p164;
|
| 479 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 480 |
+
and.pred %p166, %p126, %p163;
|
| 481 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 482 |
+
or.pred %p167, %p162, %p166;
|
| 483 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 484 |
+
setp.lt.s32 %p168, %r147, %r223;
|
| 485 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 486 |
+
and.pred %p169, %p168, %p167;
|
| 487 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 488 |
+
or.pred %p170, %p165, %p169;
|
| 489 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 490 |
+
selp.f32 %r148, %r146, %r139, %p170;
|
| 491 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 492 |
+
selp.b32 %r149, %r147, %r223, %p170;
|
| 493 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 494 |
+
setp.gt.f32 %p171, %r148, %r138;
|
| 495 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 496 |
+
setp.eq.f32 %p172, %r148, %r138;
|
| 497 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 498 |
+
setp.nan.f32 %p173, %r148, %r148;
|
| 499 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 500 |
+
and.pred %p174, %p173, %p125;
|
| 501 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 502 |
+
or.pred %p175, %p171, %p174;
|
| 503 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 504 |
+
and.pred %p176, %p124, %p173;
|
| 505 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 506 |
+
or.pred %p177, %p172, %p176;
|
| 507 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 508 |
+
setp.lt.s32 %p178, %r149, %r222;
|
| 509 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 510 |
+
and.pred %p179, %p178, %p177;
|
| 511 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 512 |
+
or.pred %p180, %p175, %p179;
|
| 513 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 514 |
+
selp.f32 %r150, %r148, %r138, %p180;
|
| 515 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 516 |
+
selp.b32 %r151, %r149, %r222, %p180;
|
| 517 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 518 |
+
shfl.sync.bfly.b32 %r152, %r142, 16, 31, -1;
|
| 519 |
+
shfl.sync.bfly.b32 %r153, %r143, 16, 31, -1;
|
| 520 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 521 |
+
setp.gt.f32 %p181, %r142, %r152;
|
| 522 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 523 |
+
setp.eq.f32 %p182, %r142, %r152;
|
| 524 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 525 |
+
setp.nan.f32 %p183, %r142, %r142;
|
| 526 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 527 |
+
setp.nan.f32 %p184, %r152, %r152;
|
| 528 |
+
setp.num.f32 %p185, %r152, %r152;
|
| 529 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 530 |
+
and.pred %p186, %p183, %p185;
|
| 531 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 532 |
+
or.pred %p187, %p181, %p186;
|
| 533 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 534 |
+
and.pred %p188, %p183, %p184;
|
| 535 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 536 |
+
or.pred %p189, %p182, %p188;
|
| 537 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 538 |
+
setp.lt.s32 %p190, %r143, %r153;
|
| 539 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 540 |
+
and.pred %p191, %p190, %p189;
|
| 541 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 542 |
+
or.pred %p192, %p187, %p191;
|
| 543 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 544 |
+
selp.f32 %r154, %r142, %r152, %p192;
|
| 545 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 546 |
+
selp.b32 %r155, %r143, %r153, %p192;
|
| 547 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 548 |
+
shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1;
|
| 549 |
+
shfl.sync.bfly.b32 %r157, %r155, 8, 31, -1;
|
| 550 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 551 |
+
setp.gt.f32 %p193, %r154, %r156;
|
| 552 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 553 |
+
setp.eq.f32 %p194, %r154, %r156;
|
| 554 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 555 |
+
setp.nan.f32 %p195, %r154, %r154;
|
| 556 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 557 |
+
setp.nan.f32 %p196, %r156, %r156;
|
| 558 |
+
setp.num.f32 %p197, %r156, %r156;
|
| 559 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 560 |
+
and.pred %p198, %p195, %p197;
|
| 561 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 562 |
+
or.pred %p199, %p193, %p198;
|
| 563 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 564 |
+
and.pred %p200, %p196, %p195;
|
| 565 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 566 |
+
or.pred %p201, %p194, %p200;
|
| 567 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 568 |
+
setp.lt.s32 %p202, %r155, %r157;
|
| 569 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 570 |
+
and.pred %p203, %p202, %p201;
|
| 571 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 572 |
+
or.pred %p204, %p199, %p203;
|
| 573 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 574 |
+
selp.f32 %r158, %r154, %r156, %p204;
|
| 575 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 576 |
+
selp.b32 %r159, %r155, %r157, %p204;
|
| 577 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 578 |
+
shfl.sync.bfly.b32 %r160, %r158, 4, 31, -1;
|
| 579 |
+
shfl.sync.bfly.b32 %r161, %r159, 4, 31, -1;
|
| 580 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 581 |
+
setp.gt.f32 %p205, %r158, %r160;
|
| 582 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 583 |
+
setp.eq.f32 %p206, %r158, %r160;
|
| 584 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 585 |
+
setp.nan.f32 %p207, %r158, %r158;
|
| 586 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 587 |
+
setp.nan.f32 %p208, %r160, %r160;
|
| 588 |
+
setp.num.f32 %p209, %r160, %r160;
|
| 589 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 590 |
+
and.pred %p210, %p207, %p209;
|
| 591 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 592 |
+
or.pred %p211, %p205, %p210;
|
| 593 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 594 |
+
and.pred %p212, %p208, %p207;
|
| 595 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 596 |
+
or.pred %p213, %p206, %p212;
|
| 597 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 598 |
+
setp.lt.s32 %p214, %r159, %r161;
|
| 599 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 600 |
+
and.pred %p215, %p214, %p213;
|
| 601 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 602 |
+
or.pred %p216, %p211, %p215;
|
| 603 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 604 |
+
selp.f32 %r162, %r158, %r160, %p216;
|
| 605 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 606 |
+
selp.b32 %r163, %r159, %r161, %p216;
|
| 607 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 608 |
+
shfl.sync.bfly.b32 %r164, %r162, 2, 31, -1;
|
| 609 |
+
shfl.sync.bfly.b32 %r165, %r163, 2, 31, -1;
|
| 610 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 611 |
+
setp.gt.f32 %p217, %r162, %r164;
|
| 612 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 613 |
+
setp.eq.f32 %p218, %r162, %r164;
|
| 614 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 615 |
+
setp.nan.f32 %p219, %r162, %r162;
|
| 616 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 617 |
+
setp.nan.f32 %p220, %r164, %r164;
|
| 618 |
+
setp.num.f32 %p221, %r164, %r164;
|
| 619 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 620 |
+
and.pred %p222, %p219, %p221;
|
| 621 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 622 |
+
or.pred %p223, %p217, %p222;
|
| 623 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 624 |
+
and.pred %p224, %p220, %p219;
|
| 625 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 626 |
+
or.pred %p225, %p218, %p224;
|
| 627 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 628 |
+
setp.lt.s32 %p226, %r163, %r165;
|
| 629 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 630 |
+
and.pred %p227, %p226, %p225;
|
| 631 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 632 |
+
or.pred %p228, %p223, %p227;
|
| 633 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 634 |
+
selp.f32 %r166, %r162, %r164, %p228;
|
| 635 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 636 |
+
selp.b32 %r167, %r163, %r165, %p228;
|
| 637 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 638 |
+
shfl.sync.bfly.b32 %r168, %r166, 1, 31, -1;
|
| 639 |
+
shfl.sync.bfly.b32 %r169, %r167, 1, 31, -1;
|
| 640 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 641 |
+
setp.gt.f32 %p229, %r166, %r168;
|
| 642 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 643 |
+
setp.eq.f32 %p230, %r166, %r168;
|
| 644 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 645 |
+
setp.nan.f32 %p231, %r166, %r166;
|
| 646 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 647 |
+
setp.nan.f32 %p232, %r168, %r168;
|
| 648 |
+
setp.num.f32 %p233, %r168, %r168;
|
| 649 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 650 |
+
and.pred %p234, %p231, %p233;
|
| 651 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 652 |
+
or.pred %p235, %p229, %p234;
|
| 653 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 654 |
+
and.pred %p236, %p232, %p231;
|
| 655 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 656 |
+
or.pred %p237, %p230, %p236;
|
| 657 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 658 |
+
setp.lt.s32 %p238, %r167, %r169;
|
| 659 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 660 |
+
and.pred %p239, %p238, %p237;
|
| 661 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 662 |
+
or.pred %p240, %p235, %p239;
|
| 663 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 664 |
+
selp.b32 %r116, %r167, %r169, %p240;
|
| 665 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 666 |
+
shfl.sync.bfly.b32 %r170, %r150, 16, 31, -1;
|
| 667 |
+
shfl.sync.bfly.b32 %r171, %r151, 16, 31, -1;
|
| 668 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 669 |
+
setp.gt.f32 %p241, %r150, %r170;
|
| 670 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 671 |
+
setp.eq.f32 %p242, %r150, %r170;
|
| 672 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 673 |
+
setp.nan.f32 %p243, %r150, %r150;
|
| 674 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 675 |
+
setp.nan.f32 %p244, %r170, %r170;
|
| 676 |
+
setp.num.f32 %p245, %r170, %r170;
|
| 677 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 678 |
+
and.pred %p246, %p243, %p245;
|
| 679 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 680 |
+
or.pred %p247, %p241, %p246;
|
| 681 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 682 |
+
and.pred %p248, %p243, %p244;
|
| 683 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 684 |
+
or.pred %p249, %p242, %p248;
|
| 685 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 686 |
+
setp.lt.s32 %p250, %r151, %r171;
|
| 687 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 688 |
+
and.pred %p251, %p250, %p249;
|
| 689 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 690 |
+
or.pred %p252, %p247, %p251;
|
| 691 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 692 |
+
selp.f32 %r172, %r150, %r170, %p252;
|
| 693 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 694 |
+
selp.b32 %r173, %r151, %r171, %p252;
|
| 695 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 696 |
+
shfl.sync.bfly.b32 %r174, %r172, 8, 31, -1;
|
| 697 |
+
shfl.sync.bfly.b32 %r175, %r173, 8, 31, -1;
|
| 698 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 699 |
+
setp.gt.f32 %p253, %r172, %r174;
|
| 700 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 701 |
+
setp.eq.f32 %p254, %r172, %r174;
|
| 702 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 703 |
+
setp.nan.f32 %p255, %r172, %r172;
|
| 704 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 705 |
+
setp.nan.f32 %p256, %r174, %r174;
|
| 706 |
+
setp.num.f32 %p257, %r174, %r174;
|
| 707 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 708 |
+
and.pred %p258, %p255, %p257;
|
| 709 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 710 |
+
or.pred %p259, %p253, %p258;
|
| 711 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 712 |
+
and.pred %p260, %p256, %p255;
|
| 713 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 714 |
+
or.pred %p261, %p254, %p260;
|
| 715 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 716 |
+
setp.lt.s32 %p262, %r173, %r175;
|
| 717 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 718 |
+
and.pred %p263, %p262, %p261;
|
| 719 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 720 |
+
or.pred %p264, %p259, %p263;
|
| 721 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 722 |
+
selp.f32 %r176, %r172, %r174, %p264;
|
| 723 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 724 |
+
selp.b32 %r177, %r173, %r175, %p264;
|
| 725 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 726 |
+
shfl.sync.bfly.b32 %r178, %r176, 4, 31, -1;
|
| 727 |
+
shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1;
|
| 728 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 729 |
+
setp.gt.f32 %p265, %r176, %r178;
|
| 730 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 731 |
+
setp.eq.f32 %p266, %r176, %r178;
|
| 732 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 733 |
+
setp.nan.f32 %p267, %r176, %r176;
|
| 734 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 735 |
+
setp.nan.f32 %p268, %r178, %r178;
|
| 736 |
+
setp.num.f32 %p269, %r178, %r178;
|
| 737 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 738 |
+
and.pred %p270, %p267, %p269;
|
| 739 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 740 |
+
or.pred %p271, %p265, %p270;
|
| 741 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 742 |
+
and.pred %p272, %p268, %p267;
|
| 743 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 744 |
+
or.pred %p273, %p266, %p272;
|
| 745 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 746 |
+
setp.lt.s32 %p274, %r177, %r179;
|
| 747 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 748 |
+
and.pred %p275, %p274, %p273;
|
| 749 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 750 |
+
or.pred %p276, %p271, %p275;
|
| 751 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 752 |
+
selp.f32 %r180, %r176, %r178, %p276;
|
| 753 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 754 |
+
selp.b32 %r181, %r177, %r179, %p276;
|
| 755 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 756 |
+
shfl.sync.bfly.b32 %r182, %r180, 2, 31, -1;
|
| 757 |
+
shfl.sync.bfly.b32 %r183, %r181, 2, 31, -1;
|
| 758 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 759 |
+
setp.gt.f32 %p277, %r180, %r182;
|
| 760 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 761 |
+
setp.eq.f32 %p278, %r180, %r182;
|
| 762 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 763 |
+
setp.nan.f32 %p279, %r180, %r180;
|
| 764 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 765 |
+
setp.nan.f32 %p280, %r182, %r182;
|
| 766 |
+
setp.num.f32 %p281, %r182, %r182;
|
| 767 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 768 |
+
and.pred %p282, %p279, %p281;
|
| 769 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 770 |
+
or.pred %p283, %p277, %p282;
|
| 771 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 772 |
+
and.pred %p284, %p280, %p279;
|
| 773 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 774 |
+
or.pred %p285, %p278, %p284;
|
| 775 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 776 |
+
setp.lt.s32 %p286, %r181, %r183;
|
| 777 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 778 |
+
and.pred %p287, %p286, %p285;
|
| 779 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 780 |
+
or.pred %p288, %p283, %p287;
|
| 781 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 782 |
+
selp.f32 %r184, %r180, %r182, %p288;
|
| 783 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 784 |
+
selp.b32 %r185, %r181, %r183, %p288;
|
| 785 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 786 |
+
shfl.sync.bfly.b32 %r186, %r184, 1, 31, -1;
|
| 787 |
+
shfl.sync.bfly.b32 %r187, %r185, 1, 31, -1;
|
| 788 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 789 |
+
setp.gt.f32 %p289, %r184, %r186;
|
| 790 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 791 |
+
setp.eq.f32 %p290, %r184, %r186;
|
| 792 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 793 |
+
setp.nan.f32 %p291, %r184, %r184;
|
| 794 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 795 |
+
setp.nan.f32 %p292, %r186, %r186;
|
| 796 |
+
setp.num.f32 %p293, %r186, %r186;
|
| 797 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 798 |
+
and.pred %p294, %p291, %p293;
|
| 799 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 800 |
+
or.pred %p295, %p289, %p294;
|
| 801 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 802 |
+
and.pred %p296, %p292, %p291;
|
| 803 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 804 |
+
or.pred %p297, %p290, %p296;
|
| 805 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 806 |
+
setp.lt.s32 %p298, %r185, %r187;
|
| 807 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 808 |
+
and.pred %p299, %p298, %p297;
|
| 809 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 810 |
+
or.pred %p300, %p295, %p299;
|
| 811 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 812 |
+
selp.b32 %r120, %r185, %r187, %p300;
|
| 813 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 814 |
+
bfe.u32 %r188, %r2, 5, 2;
|
| 815 |
+
setp.eq.b32 %p100, %r131, 0;
|
| 816 |
+
shr.u32 %r189, %r3, 5;
|
| 817 |
+
or.b32 %r190, %r189, %r188;
|
| 818 |
+
shl.b32 %r191, %r190, 2;
|
| 819 |
+
mov.b32 %r192, global_smem;
|
| 820 |
+
add.s32 %r113, %r192, %r191;
|
| 821 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 822 |
+
selp.b32 %r114, %r166, %r168, %p240;
|
| 823 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 824 |
+
// begin inline asm
|
| 825 |
+
@%p100 st.shared.b32 [ %r113 + 0 ], %r114;
|
| 826 |
+
// end inline asm
|
| 827 |
+
add.s32 %r193, %r192, 128;
|
| 828 |
+
add.s32 %r115, %r193, %r191;
|
| 829 |
+
// begin inline asm
|
| 830 |
+
@%p100 st.shared.b32 [ %r115 + 0 ], %r116;
|
| 831 |
+
// end inline asm
|
| 832 |
+
shl.b32 %r194, %r188, 2;
|
| 833 |
+
shl.b32 %r195, %r4, 4;
|
| 834 |
+
or.b32 %r196, %r195, %r194;
|
| 835 |
+
add.s32 %r117, %r192, %r196;
|
| 836 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 837 |
+
selp.b32 %r118, %r184, %r186, %p300;
|
| 838 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 839 |
+
// begin inline asm
|
| 840 |
+
@%p100 st.shared.b32 [ %r117 + 0 ], %r118;
|
| 841 |
+
// end inline asm
|
| 842 |
+
add.s32 %r119, %r193, %r196;
|
| 843 |
+
// begin inline asm
|
| 844 |
+
@%p100 st.shared.b32 [ %r119 + 0 ], %r120;
|
| 845 |
+
// end inline asm
|
| 846 |
+
bar.sync 0;
|
| 847 |
+
setp.lt.u32 %p104, %r2, 32;
|
| 848 |
+
add.s32 %r122, %r192, %r31;
|
| 849 |
+
// begin inline asm
|
| 850 |
+
@%p104 ld.shared.b32 %r121, [ %r122 + 0 ];
|
| 851 |
+
// end inline asm
|
| 852 |
+
add.s32 %r124, %r193, %r31;
|
| 853 |
+
// begin inline asm
|
| 854 |
+
@%p104 ld.shared.b32 %r123, [ %r124 + 0 ];
|
| 855 |
+
// end inline asm
|
| 856 |
+
shfl.sync.bfly.b32 %r198, %r121, 2, 31, -1;
|
| 857 |
+
shfl.sync.bfly.b32 %r199, %r123, 2, 31, -1;
|
| 858 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 859 |
+
setp.gt.f32 %p301, %r121, %r198;
|
| 860 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 861 |
+
setp.eq.f32 %p302, %r121, %r198;
|
| 862 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 863 |
+
setp.nan.f32 %p303, %r121, %r121;
|
| 864 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 865 |
+
setp.nan.f32 %p304, %r198, %r198;
|
| 866 |
+
setp.num.f32 %p305, %r198, %r198;
|
| 867 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 868 |
+
and.pred %p306, %p303, %p305;
|
| 869 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 870 |
+
or.pred %p307, %p301, %p306;
|
| 871 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 872 |
+
and.pred %p308, %p303, %p304;
|
| 873 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 874 |
+
or.pred %p309, %p302, %p308;
|
| 875 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 876 |
+
setp.lt.s32 %p310, %r123, %r199;
|
| 877 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 878 |
+
and.pred %p311, %p310, %p309;
|
| 879 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 880 |
+
or.pred %p312, %p307, %p311;
|
| 881 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 882 |
+
selp.f32 %r200, %r121, %r198, %p312;
|
| 883 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 884 |
+
selp.b32 %r201, %r123, %r199, %p312;
|
| 885 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 886 |
+
shfl.sync.bfly.b32 %r202, %r200, 1, 31, -1;
|
| 887 |
+
shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1;
|
| 888 |
+
.loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 889 |
+
setp.gt.f32 %p313, %r200, %r202;
|
| 890 |
+
.loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 891 |
+
setp.eq.f32 %p314, %r200, %r202;
|
| 892 |
+
.loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 893 |
+
setp.nan.f32 %p315, %r200, %r200;
|
| 894 |
+
.loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 895 |
+
setp.nan.f32 %p316, %r202, %r202;
|
| 896 |
+
setp.num.f32 %p317, %r202, %r202;
|
| 897 |
+
.loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 898 |
+
and.pred %p318, %p315, %p317;
|
| 899 |
+
.loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 900 |
+
or.pred %p319, %p313, %p318;
|
| 901 |
+
.loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 902 |
+
and.pred %p320, %p316, %p315;
|
| 903 |
+
.loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 904 |
+
or.pred %p321, %p314, %p320;
|
| 905 |
+
.loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 906 |
+
setp.lt.s32 %p322, %r201, %r203;
|
| 907 |
+
.loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 908 |
+
and.pred %p323, %p322, %p321;
|
| 909 |
+
.loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 910 |
+
or.pred %p324, %p319, %p323;
|
| 911 |
+
.loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 912 |
+
selp.b32 %r128, %r201, %r203, %p324;
|
| 913 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 914 |
+
and.b32 %r204, %r2, 995;
|
| 915 |
+
setp.eq.b32 %p106, %r204, 0;
|
| 916 |
+
.loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 917 |
+
selp.b32 %r126, %r200, %r202, %p324;
|
| 918 |
+
.loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ]
|
| 919 |
+
// begin inline asm
|
| 920 |
+
@%p106 st.shared.b32 [ %r122 + 0 ], %r126;
|
| 921 |
+
// end inline asm
|
| 922 |
+
// begin inline asm
|
| 923 |
+
@%p106 st.shared.b32 [ %r124 + 0 ], %r128;
|
| 924 |
+
// end inline asm
|
| 925 |
+
bar.sync 0;
|
| 926 |
+
shr.u32 %r205, %r3, 3;
|
| 927 |
+
add.s32 %r206, %r193, %r205;
|
| 928 |
+
ld.shared.b32 %r207, [%r206];
|
| 929 |
+
add.s32 %r208, %r193, %r195;
|
| 930 |
+
ld.shared.b32 %r209, [%r208];
|
| 931 |
+
$L__tmp10:
|
| 932 |
+
.loc 1 48 25 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:25
|
| 933 |
+
mad.wide.s32 %rd57, %r130, 8, %rd15;
|
| 934 |
+
.loc 1 48 36 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:36
|
| 935 |
+
bar.sync 0;
|
| 936 |
+
shr.u32 %r210, %r3, 4;
|
| 937 |
+
add.s32 %r211, %r192, %r210;
|
| 938 |
+
st.shared.v2.b32 [%r211], {%r207, %r209};
|
| 939 |
+
bar.sync 0;
|
| 940 |
+
shl.b32 %r212, %r2, 3;
|
| 941 |
+
and.b32 %r213, %r212, 24;
|
| 942 |
+
and.b32 %r214, %r2, 4;
|
| 943 |
+
add.s32 %r215, %r192, %r213;
|
| 944 |
+
add.s32 %r216, %r215, %r214;
|
| 945 |
+
ld.shared.s32 %rd56, [%r216];
|
| 946 |
+
and.b32 %r217, %r2, 504;
|
| 947 |
+
setp.eq.b32 %p108, %r217, 0;
|
| 948 |
+
// begin inline asm
|
| 949 |
+
@%p108 st.global.b64 [ %rd57 + 0 ], { %rd56 };
|
| 950 |
+
// end inline asm
|
| 951 |
+
.loc 1 48 4 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:4
|
| 952 |
+
ret;
|
| 953 |
+
$L__tmp11:
|
| 954 |
+
$L__func_end0:
|
| 955 |
+
// -- End function
|
| 956 |
+
}
|
| 957 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py"
|
| 958 |
+
.file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
|
| 959 |
+
.section .debug_abbrev
|
| 960 |
+
{
|
| 961 |
+
.b8 1 // Abbreviation Code
|
| 962 |
+
.b8 17 // DW_TAG_compile_unit
|
| 963 |
+
.b8 1 // DW_CHILDREN_yes
|
| 964 |
+
.b8 37 // DW_AT_producer
|
| 965 |
+
.b8 8 // DW_FORM_string
|
| 966 |
+
.b8 19 // DW_AT_language
|
| 967 |
+
.b8 5 // DW_FORM_data2
|
| 968 |
+
.b8 3 // DW_AT_name
|
| 969 |
+
.b8 8 // DW_FORM_string
|
| 970 |
+
.b8 16 // DW_AT_stmt_list
|
| 971 |
+
.b8 6 // DW_FORM_data4
|
| 972 |
+
.b8 27 // DW_AT_comp_dir
|
| 973 |
+
.b8 8 // DW_FORM_string
|
| 974 |
+
.b8 0 // EOM(1)
|
| 975 |
+
.b8 0 // EOM(2)
|
| 976 |
+
.b8 2 // Abbreviation Code
|
| 977 |
+
.b8 46 // DW_TAG_subprogram
|
| 978 |
+
.b8 0 // DW_CHILDREN_no
|
| 979 |
+
.b8 3 // DW_AT_name
|
| 980 |
+
.b8 8 // DW_FORM_string
|
| 981 |
+
.b8 32 // DW_AT_inline
|
| 982 |
+
.b8 11 // DW_FORM_data1
|
| 983 |
+
.b8 0 // EOM(1)
|
| 984 |
+
.b8 0 // EOM(2)
|
| 985 |
+
.b8 3 // Abbreviation Code
|
| 986 |
+
.b8 46 // DW_TAG_subprogram
|
| 987 |
+
.b8 1 // DW_CHILDREN_yes
|
| 988 |
+
.b8 17 // DW_AT_low_pc
|
| 989 |
+
.b8 1 // DW_FORM_addr
|
| 990 |
+
.b8 18 // DW_AT_high_pc
|
| 991 |
+
.b8 1 // DW_FORM_addr
|
| 992 |
+
.b8 49 // DW_AT_abstract_origin
|
| 993 |
+
.b8 19 // DW_FORM_ref4
|
| 994 |
+
.b8 0 // EOM(1)
|
| 995 |
+
.b8 0 // EOM(2)
|
| 996 |
+
.b8 4 // Abbreviation Code
|
| 997 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 998 |
+
.b8 0 // DW_CHILDREN_no
|
| 999 |
+
.b8 49 // DW_AT_abstract_origin
|
| 1000 |
+
.b8 19 // DW_FORM_ref4
|
| 1001 |
+
.b8 17 // DW_AT_low_pc
|
| 1002 |
+
.b8 1 // DW_FORM_addr
|
| 1003 |
+
.b8 18 // DW_AT_high_pc
|
| 1004 |
+
.b8 1 // DW_FORM_addr
|
| 1005 |
+
.b8 88 // DW_AT_call_file
|
| 1006 |
+
.b8 11 // DW_FORM_data1
|
| 1007 |
+
.b8 89 // DW_AT_call_line
|
| 1008 |
+
.b8 11 // DW_FORM_data1
|
| 1009 |
+
.b8 87 // DW_AT_call_column
|
| 1010 |
+
.b8 11 // DW_FORM_data1
|
| 1011 |
+
.b8 0 // EOM(1)
|
| 1012 |
+
.b8 0 // EOM(2)
|
| 1013 |
+
.b8 0 // EOM(3)
|
| 1014 |
+
}
|
| 1015 |
+
.section .debug_info
|
| 1016 |
+
{
|
| 1017 |
+
.b32 234 // Length of Unit
|
| 1018 |
+
.b8 2 // DWARF version number
|
| 1019 |
+
.b8 0
|
| 1020 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 1021 |
+
.b8 8 // Address Size (in bytes)
|
| 1022 |
+
.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit
|
| 1023 |
+
.b8 116 // DW_AT_producer
|
| 1024 |
+
.b8 114
|
| 1025 |
+
.b8 105
|
| 1026 |
+
.b8 116
|
| 1027 |
+
.b8 111
|
| 1028 |
+
.b8 110
|
| 1029 |
+
.b8 0
|
| 1030 |
+
.b8 2 // DW_AT_language
|
| 1031 |
+
.b8 0
|
| 1032 |
+
.b8 99 // DW_AT_name
|
| 1033 |
+
.b8 102
|
| 1034 |
+
.b8 118
|
| 1035 |
+
.b8 109
|
| 1036 |
+
.b8 54
|
| 1037 |
+
.b8 53
|
| 1038 |
+
.b8 53
|
| 1039 |
+
.b8 106
|
| 1040 |
+
.b8 53
|
| 1041 |
+
.b8 99
|
| 1042 |
+
.b8 109
|
| 1043 |
+
.b8 52
|
| 1044 |
+
.b8 53
|
| 1045 |
+
.b8 50
|
| 1046 |
+
.b8 52
|
| 1047 |
+
.b8 103
|
| 1048 |
+
.b8 109
|
| 1049 |
+
.b8 100
|
| 1050 |
+
.b8 121
|
| 1051 |
+
.b8 104
|
| 1052 |
+
.b8 114
|
| 1053 |
+
.b8 55
|
| 1054 |
+
.b8 121
|
| 1055 |
+
.b8 108
|
| 1056 |
+
.b8 105
|
| 1057 |
+
.b8 54
|
| 1058 |
+
.b8 100
|
| 1059 |
+
.b8 102
|
| 1060 |
+
.b8 102
|
| 1061 |
+
.b8 112
|
| 1062 |
+
.b8 97
|
| 1063 |
+
.b8 107
|
| 1064 |
+
.b8 121
|
| 1065 |
+
.b8 115
|
| 1066 |
+
.b8 117
|
| 1067 |
+
.b8 121
|
| 1068 |
+
.b8 99
|
| 1069 |
+
.b8 117
|
| 1070 |
+
.b8 111
|
| 1071 |
+
.b8 122
|
| 1072 |
+
.b8 107
|
| 1073 |
+
.b8 113
|
| 1074 |
+
.b8 109
|
| 1075 |
+
.b8 121
|
| 1076 |
+
.b8 117
|
| 1077 |
+
.b8 97
|
| 1078 |
+
.b8 111
|
| 1079 |
+
.b8 110
|
| 1080 |
+
.b8 107
|
| 1081 |
+
.b8 119
|
| 1082 |
+
.b8 98
|
| 1083 |
+
.b8 103
|
| 1084 |
+
.b8 46
|
| 1085 |
+
.b8 112
|
| 1086 |
+
.b8 121
|
| 1087 |
+
.b8 0
|
| 1088 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 1089 |
+
.b8 47 // DW_AT_comp_dir
|
| 1090 |
+
.b8 119
|
| 1091 |
+
.b8 111
|
| 1092 |
+
.b8 114
|
| 1093 |
+
.b8 107
|
| 1094 |
+
.b8 115
|
| 1095 |
+
.b8 112
|
| 1096 |
+
.b8 97
|
| 1097 |
+
.b8 99
|
| 1098 |
+
.b8 101
|
| 1099 |
+
.b8 47
|
| 1100 |
+
.b8 104
|
| 1101 |
+
.b8 97
|
| 1102 |
+
.b8 110
|
| 1103 |
+
.b8 114
|
| 1104 |
+
.b8 117
|
| 1105 |
+
.b8 105
|
| 1106 |
+
.b8 47
|
| 1107 |
+
.b8 83
|
| 1108 |
+
.b8 112
|
| 1109 |
+
.b8 101
|
| 1110 |
+
.b8 99
|
| 1111 |
+
.b8 70
|
| 1112 |
+
.b8 111
|
| 1113 |
+
.b8 114
|
| 1114 |
+
.b8 103
|
| 1115 |
+
.b8 101
|
| 1116 |
+
.b8 45
|
| 1117 |
+
.b8 101
|
| 1118 |
+
.b8 120
|
| 1119 |
+
.b8 116
|
| 1120 |
+
.b8 47
|
| 1121 |
+
.b8 99
|
| 1122 |
+
.b8 97
|
| 1123 |
+
.b8 99
|
| 1124 |
+
.b8 104
|
| 1125 |
+
.b8 101
|
| 1126 |
+
.b8 47
|
| 1127 |
+
.b8 99
|
| 1128 |
+
.b8 111
|
| 1129 |
+
.b8 109
|
| 1130 |
+
.b8 112
|
| 1131 |
+
.b8 105
|
| 1132 |
+
.b8 108
|
| 1133 |
+
.b8 101
|
| 1134 |
+
.b8 100
|
| 1135 |
+
.b8 95
|
| 1136 |
+
.b8 107
|
| 1137 |
+
.b8 101
|
| 1138 |
+
.b8 114
|
| 1139 |
+
.b8 110
|
| 1140 |
+
.b8 101
|
| 1141 |
+
.b8 108
|
| 1142 |
+
.b8 115
|
| 1143 |
+
.b8 47
|
| 1144 |
+
.b8 102
|
| 1145 |
+
.b8 118
|
| 1146 |
+
.b8 0
|
| 1147 |
+
.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram
|
| 1148 |
+
.b8 116 // DW_AT_name
|
| 1149 |
+
.b8 114
|
| 1150 |
+
.b8 105
|
| 1151 |
+
.b8 116
|
| 1152 |
+
.b8 111
|
| 1153 |
+
.b8 110
|
| 1154 |
+
.b8 95
|
| 1155 |
+
.b8 114
|
| 1156 |
+
.b8 101
|
| 1157 |
+
.b8 100
|
| 1158 |
+
.b8 95
|
| 1159 |
+
.b8 102
|
| 1160 |
+
.b8 117
|
| 1161 |
+
.b8 115
|
| 1162 |
+
.b8 101
|
| 1163 |
+
.b8 100
|
| 1164 |
+
.b8 95
|
| 1165 |
+
.b8 97
|
| 1166 |
+
.b8 114
|
| 1167 |
+
.b8 103
|
| 1168 |
+
.b8 109
|
| 1169 |
+
.b8 97
|
| 1170 |
+
.b8 120
|
| 1171 |
+
.b8 95
|
| 1172 |
+
.b8 49
|
| 1173 |
+
.b8 0
|
| 1174 |
+
.b8 1 // DW_AT_inline
|
| 1175 |
+
.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram
|
| 1176 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 1177 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 1178 |
+
.b32 139 // DW_AT_abstract_origin
|
| 1179 |
+
.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine
|
| 1180 |
+
.b32 139 // DW_AT_abstract_origin
|
| 1181 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 1182 |
+
.b64 $L__tmp8 // DW_AT_high_pc
|
| 1183 |
+
.b8 1 // DW_AT_call_file
|
| 1184 |
+
.b8 42 // DW_AT_call_line
|
| 1185 |
+
.b8 38 // DW_AT_call_column
|
| 1186 |
+
.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine
|
| 1187 |
+
.b32 139 // DW_AT_abstract_origin
|
| 1188 |
+
.b64 $L__tmp9 // DW_AT_low_pc
|
| 1189 |
+
.b64 $L__tmp10 // DW_AT_high_pc
|
| 1190 |
+
.b8 1 // DW_AT_call_file
|
| 1191 |
+
.b8 46 // DW_AT_call_line
|
| 1192 |
+
.b8 75 // DW_AT_call_column
|
| 1193 |
+
.b8 0 // End Of Children Mark
|
| 1194 |
+
.b8 0 // End Of Children Mark
|
| 1195 |
+
}
|
| 1196 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0)
|
| 2 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0)
|
| 3 |
+
#loc45 = loc(unknown)
|
| 4 |
+
#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0)
|
| 5 |
+
#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0)
|
| 6 |
+
#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0)
|
| 7 |
+
#loc70 = loc("in_ptr0"(#loc))
|
| 8 |
+
#loc71 = loc("out_ptr0"(#loc))
|
| 9 |
+
#loc72 = loc("xnumel"(#loc))
|
| 10 |
+
#loc73 = loc("r0_numel"(#loc))
|
| 11 |
+
#loc100 = loc("a_value"(#loc33))
|
| 12 |
+
#loc101 = loc("a_index"(#loc33))
|
| 13 |
+
#loc102 = loc("b_value"(#loc33))
|
| 14 |
+
#loc103 = loc("b_index"(#loc33))
|
| 15 |
+
#loc116 = loc("x"(#loc53))
|
| 16 |
+
#loc117 = loc("x"(#loc57))
|
| 17 |
+
#loc118 = loc("value"(#loc66))
|
| 18 |
+
#loc119 = loc("index"(#loc66))
|
| 19 |
+
module {
|
| 20 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 21 |
+
%xnumel_0 = arith.constant 16384 : i32 loc(#loc74)
|
| 22 |
+
%r0_numel_1 = arith.constant 32000 : i32 loc(#loc75)
|
| 23 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc76)
|
| 24 |
+
%xoffset_2 = arith.constant 8 : i32 loc(#loc77)
|
| 25 |
+
%xoffset_3 = arith.constant 8 : i32 loc(#loc77)
|
| 26 |
+
%xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77)
|
| 27 |
+
%xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc78)
|
| 28 |
+
%xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc79)
|
| 29 |
+
%xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc80)
|
| 30 |
+
%xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc80)
|
| 31 |
+
%xmask = arith.constant true loc(#loc81)
|
| 32 |
+
%xmask_8 = arith.constant dense<true> : tensor<8x512xi1> loc(#loc81)
|
| 33 |
+
%r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc82)
|
| 34 |
+
%r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc83)
|
| 35 |
+
%x0 = arith.constant 2048 : i32 loc(#loc84)
|
| 36 |
+
%x0_10 = arith.constant 2048 : i32 loc(#loc84)
|
| 37 |
+
%x0_11 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc84)
|
| 38 |
+
%x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc84)
|
| 39 |
+
%x1 = arith.constant 2048 : i32 loc(#loc85)
|
| 40 |
+
%x1_13 = arith.constant 2048 : i32 loc(#loc85)
|
| 41 |
+
%x1_14 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc85)
|
| 42 |
+
%x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc85)
|
| 43 |
+
%_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86)
|
| 44 |
+
%_tmp2_16 = arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc86)
|
| 45 |
+
%_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87)
|
| 46 |
+
%_tmp2_index_17 = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc87)
|
| 47 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc15)
|
| 48 |
+
%c512_i32 = arith.constant 512 : i32 loc(#loc15)
|
| 49 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
|
| 50 |
+
%1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
|
| 51 |
+
%2 = arith.bitcast %c512_i32 : i32 to i32 loc(#loc15)
|
| 52 |
+
%3 = ub.poison : i32 loc(#loc15)
|
| 53 |
+
%_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 {
|
| 54 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc89)
|
| 55 |
+
%r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x512xi32> loc(#loc89)
|
| 56 |
+
%r0_mask = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc90)
|
| 57 |
+
%r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x512xi32> loc(#loc90)
|
| 58 |
+
%tmp0 = arith.constant 32000 : i32 loc(#loc91)
|
| 59 |
+
%tmp0_23 = arith.constant 32000 : i32 loc(#loc91)
|
| 60 |
+
%tmp0_24 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc91)
|
| 61 |
+
%tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<8x1xi32> loc(#loc91)
|
| 62 |
+
%tmp0_26 = tt.broadcast %r0_index_21 : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc92)
|
| 63 |
+
%tmp0_27 = tt.broadcast %tmp0_25 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc92)
|
| 64 |
+
%tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x512xi32> loc(#loc92)
|
| 65 |
+
%tmp0_29 = arith.constant 65760000 : i32 loc(#loc93)
|
| 66 |
+
%tmp0_30 = arith.constant 65760000 : i32 loc(#loc93)
|
| 67 |
+
%tmp0_31 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc93)
|
| 68 |
+
%tmp0_32 = arith.muli %tmp0_31, %x1_15 : tensor<8x1xi32> loc(#loc93)
|
| 69 |
+
%tmp0_33 = tt.broadcast %tmp0_32 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc94)
|
| 70 |
+
%tmp0_34 = arith.addi %tmp0_28, %tmp0_33 : tensor<8x512xi32> loc(#loc94)
|
| 71 |
+
%tmp0_35 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<8x512x!tt.ptr<f32>> loc(#loc95)
|
| 72 |
+
%tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<8x512x!tt.ptr<f32>>, tensor<8x512xi32> loc(#loc95)
|
| 73 |
+
%tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc96)
|
| 74 |
+
%tmp0_38 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc96)
|
| 75 |
+
%tmp0_39 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc96)
|
| 76 |
+
%tmp0_40 = tt.load %tmp0_36, %tmp0_38, %tmp0_39 evictionPolicy = evict_first : tensor<8x512x!tt.ptr<f32>> loc(#loc96)
|
| 77 |
+
%8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%_tmp2_19, %_tmp2_index_20, %tmp0_40, %r0_index_21) : (tensor<8x512xf32>, tensor<8x512xi32>, tensor<8x512xf32>, tensor<1x512xi32>) -> (tensor<8x512xf32>, tensor<8x512xi32>) loc(#loc24)
|
| 78 |
+
%_tmp2_41 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc97)
|
| 79 |
+
%_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_19 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc97)
|
| 80 |
+
%_tmp2_index_43 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc98)
|
| 81 |
+
%_tmp2_index_44 = arith.select %_tmp2_index_43, %8#1, %_tmp2_index_20 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc98)
|
| 82 |
+
scf.yield %_tmp2_42, %_tmp2_index_44 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc27)
|
| 83 |
+
} loc(#loc120)
|
| 84 |
+
%4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc28)
|
| 85 |
+
%tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc99)
|
| 86 |
+
%5 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<8x1x!tt.ptr<i64>> loc(#loc30)
|
| 87 |
+
%6 = tt.addptr %5, %xindex_7 : tensor<8x1x!tt.ptr<i64>>, tensor<8x1xi32> loc(#loc30)
|
| 88 |
+
%7 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc31)
|
| 89 |
+
tt.store %6, %7 : tensor<8x1x!tt.ptr<i64>> loc(#loc31)
|
| 90 |
+
tt.return loc(#loc32)
|
| 91 |
+
} loc(#loc)
|
| 92 |
+
tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%a_value: tensor<8x512xf32> loc("a_value"(#loc33)), %a_index: tensor<8x512xi32> loc("a_index"(#loc33)), %b_value: tensor<8x512xf32> loc("b_value"(#loc33)), %b_index: tensor<1x512xi32> loc("b_index"(#loc33))) -> (tensor<8x512xf32>, tensor<8x512xi32>) attributes {noinline = false} {
|
| 93 |
+
%mask = arith.cmpf ogt, %a_value, %b_value : tensor<8x512xf32> loc(#loc121)
|
| 94 |
+
%equal = arith.cmpf oeq, %a_value, %b_value : tensor<8x512xf32> loc(#loc122)
|
| 95 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%a_value) : (tensor<8x512xf32>) -> i1 loc(#loc36)
|
| 96 |
+
%1:2 = scf.if %0 -> (tensor<8x512xi1>, tensor<8x512xi1>) {
|
| 97 |
+
%a_isnan = arith.cmpf une, %a_value, %a_value : tensor<8x512xf32> loc(#loc106)
|
| 98 |
+
%b_isnan = arith.cmpf une, %b_value, %b_value : tensor<8x512xf32> loc(#loc107)
|
| 99 |
+
%mask_4 = arith.constant true loc(#loc108)
|
| 100 |
+
%mask_5 = arith.constant dense<true> : tensor<8x512xi1> loc(#loc108)
|
| 101 |
+
%mask_6 = arith.xori %b_isnan, %mask_5 : tensor<8x512xi1> loc(#loc108)
|
| 102 |
+
%mask_7 = arith.andi %a_isnan, %mask_6 : tensor<8x512xi1> loc(#loc109)
|
| 103 |
+
%mask_8 = arith.ori %mask, %mask_7 : tensor<8x512xi1> loc(#loc123)
|
| 104 |
+
%equal_9 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc111)
|
| 105 |
+
%equal_10 = arith.ori %equal, %equal_9 : tensor<8x512xi1> loc(#loc124)
|
| 106 |
+
scf.yield %mask_8, %equal_10 : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc124)
|
| 107 |
+
} else {
|
| 108 |
+
scf.yield %mask, %equal : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc45)
|
| 109 |
+
} loc(#loc37)
|
| 110 |
+
%mask_0 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc113)
|
| 111 |
+
%mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<8x512xi32> loc(#loc113)
|
| 112 |
+
%mask_2 = arith.andi %1#1, %mask_1 : tensor<8x512xi1> loc(#loc114)
|
| 113 |
+
%mask_3 = arith.ori %1#0, %mask_2 : tensor<8x512xi1> loc(#loc115)
|
| 114 |
+
%2 = arith.select %mask_3, %a_value, %b_value : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc49)
|
| 115 |
+
%3 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc50)
|
| 116 |
+
%4 = arith.select %mask_3, %a_index, %3 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc50)
|
| 117 |
+
tt.return %2, %4 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc51)
|
| 118 |
+
^bb1: // no predecessors
|
| 119 |
+
%5 = ub.poison : tensor<8x512xf32> loc(#loc52)
|
| 120 |
+
%6 = ub.poison : tensor<8x512xi32> loc(#loc52)
|
| 121 |
+
tt.return %5, %6 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc52)
|
| 122 |
+
} loc(#loc33)
|
| 123 |
+
tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} {
|
| 124 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x) : (tensor<8x512xf32>) -> tensor<8x512xf32> loc(#loc54)
|
| 125 |
+
%true = arith.constant true loc(#loc55)
|
| 126 |
+
tt.return %true : i1 loc(#loc55)
|
| 127 |
+
^bb1: // no predecessors
|
| 128 |
+
%1 = ub.poison : i1 loc(#loc56)
|
| 129 |
+
tt.return %1 : i1 loc(#loc56)
|
| 130 |
+
} loc(#loc53)
|
| 131 |
+
tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc57))) -> tensor<8x512xf32> attributes {noinline = false} {
|
| 132 |
+
%0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58)
|
| 133 |
+
%1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59)
|
| 134 |
+
%2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59)
|
| 135 |
+
%3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<8x512xf32> loc(#loc59)
|
| 136 |
+
%4 = arith.addf %x, %3 : tensor<8x512xf32> loc(#loc59)
|
| 137 |
+
tt.return %4 : tensor<8x512xf32> loc(#loc60)
|
| 138 |
+
^bb1: // no predecessors
|
| 139 |
+
%5 = ub.poison : tensor<8x512xf32> loc(#loc61)
|
| 140 |
+
tt.return %5 : tensor<8x512xf32> loc(#loc61)
|
| 141 |
+
} loc(#loc57)
|
| 142 |
+
tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
|
| 143 |
+
%false = arith.constant false loc(#loc63)
|
| 144 |
+
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc63)
|
| 145 |
+
tt.return %cst : tensor<1xi1> loc(#loc64)
|
| 146 |
+
^bb1: // no predecessors
|
| 147 |
+
%0 = ub.poison : tensor<1xi1> loc(#loc65)
|
| 148 |
+
tt.return %0 : tensor<1xi1> loc(#loc65)
|
| 149 |
+
} loc(#loc62)
|
| 150 |
+
tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%value: tensor<8x512xf32> loc("value"(#loc66)), %index: tensor<8x512xi32> loc("index"(#loc66))) -> (tensor<8xf32>, tensor<8xi32>) attributes {noinline = false} {
|
| 151 |
+
%0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({
|
| 152 |
+
^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)):
|
| 153 |
+
%3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67)
|
| 154 |
+
tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67)
|
| 155 |
+
}) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc67)
|
| 156 |
+
tt.return %0#0, %0#1 : tensor<8xf32>, tensor<8xi32> loc(#loc68)
|
| 157 |
+
^bb1: // no predecessors
|
| 158 |
+
%1 = ub.poison : tensor<8xf32> loc(#loc69)
|
| 159 |
+
%2 = ub.poison : tensor<8xi32> loc(#loc69)
|
| 160 |
+
tt.return %1, %2 : tensor<8xf32>, tensor<8xi32> loc(#loc69)
|
| 161 |
+
} loc(#loc66)
|
| 162 |
+
tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} {
|
| 163 |
+
%mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121)
|
| 164 |
+
%equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122)
|
| 165 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36)
|
| 166 |
+
%1:2 = scf.if %0 -> (i1, i1) {
|
| 167 |
+
%a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106)
|
| 168 |
+
%b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107)
|
| 169 |
+
%mask_3 = arith.constant true loc(#loc108)
|
| 170 |
+
%mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108)
|
| 171 |
+
%mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109)
|
| 172 |
+
%mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123)
|
| 173 |
+
%equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111)
|
| 174 |
+
%equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124)
|
| 175 |
+
scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124)
|
| 176 |
+
} else {
|
| 177 |
+
scf.yield %mask, %equal : i1, i1 loc(#loc45)
|
| 178 |
+
} loc(#loc37)
|
| 179 |
+
%mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113)
|
| 180 |
+
%mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114)
|
| 181 |
+
%mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115)
|
| 182 |
+
%2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49)
|
| 183 |
+
%3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50)
|
| 184 |
+
tt.return %2, %3 : f32, i32 loc(#loc51)
|
| 185 |
+
^bb1: // no predecessors
|
| 186 |
+
%4 = ub.poison : f32 loc(#loc52)
|
| 187 |
+
%5 = ub.poison : i32 loc(#loc52)
|
| 188 |
+
tt.return %4, %5 : f32, i32 loc(#loc52)
|
| 189 |
+
} loc(#loc33)
|
| 190 |
+
tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} {
|
| 191 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54)
|
| 192 |
+
%true = arith.constant true loc(#loc55)
|
| 193 |
+
tt.return %true : i1 loc(#loc55)
|
| 194 |
+
^bb1: // no predecessors
|
| 195 |
+
%1 = ub.poison : i1 loc(#loc56)
|
| 196 |
+
tt.return %1 : i1 loc(#loc56)
|
| 197 |
+
} loc(#loc53)
|
| 198 |
+
tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} {
|
| 199 |
+
%0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58)
|
| 200 |
+
%1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59)
|
| 201 |
+
%2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59)
|
| 202 |
+
%3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59)
|
| 203 |
+
tt.return %3 : tensor<1xf32> loc(#loc60)
|
| 204 |
+
^bb1: // no predecessors
|
| 205 |
+
%4 = ub.poison : tensor<1xf32> loc(#loc61)
|
| 206 |
+
tt.return %4 : tensor<1xf32> loc(#loc61)
|
| 207 |
+
} loc(#loc57)
|
| 208 |
+
} loc(#loc)
|
| 209 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":19:13)
|
| 210 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":20:15)
|
| 211 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28)
|
| 212 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:33)
|
| 213 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:36)
|
| 214 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:44)
|
| 215 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:23)
|
| 216 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":25:46)
|
| 217 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:27)
|
| 218 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37)
|
| 219 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19)
|
| 220 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19)
|
| 221 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":30:55)
|
| 222 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":31:58)
|
| 223 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40)
|
| 224 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31)
|
| 225 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29)
|
| 226 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47)
|
| 227 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41)
|
| 228 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61)
|
| 229 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52)
|
| 230 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34)
|
| 231 |
+
#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66)
|
| 232 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38)
|
| 233 |
+
#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46)
|
| 234 |
+
#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58)
|
| 235 |
+
#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8)
|
| 236 |
+
#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75)
|
| 237 |
+
#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20)
|
| 238 |
+
#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25)
|
| 239 |
+
#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36)
|
| 240 |
+
#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4)
|
| 241 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 242 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 243 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19)
|
| 244 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7)
|
| 245 |
+
#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 246 |
+
#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 247 |
+
#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 248 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 249 |
+
#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 250 |
+
#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 251 |
+
#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 252 |
+
#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 253 |
+
#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 254 |
+
#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 255 |
+
#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 256 |
+
#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 257 |
+
#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11)
|
| 258 |
+
#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4)
|
| 259 |
+
#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29)
|
| 260 |
+
#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11)
|
| 261 |
+
#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4)
|
| 262 |
+
#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30)
|
| 263 |
+
#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15)
|
| 264 |
+
#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11)
|
| 265 |
+
#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4)
|
| 266 |
+
#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0)
|
| 267 |
+
#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
|
| 268 |
+
#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11)
|
| 269 |
+
#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4)
|
| 270 |
+
#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 271 |
+
#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11)
|
| 272 |
+
#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4)
|
| 273 |
+
#loc74 = loc("xnumel"(#loc1))
|
| 274 |
+
#loc75 = loc("r0_numel"(#loc2))
|
| 275 |
+
#loc76 = loc("xoffset"(#loc3))
|
| 276 |
+
#loc77 = loc("xoffset"(#loc4))
|
| 277 |
+
#loc78 = loc("xindex"(#loc5))
|
| 278 |
+
#loc79 = loc("xindex"(#loc6))
|
| 279 |
+
#loc80 = loc("xindex"(#loc7))
|
| 280 |
+
#loc81 = loc("xmask"(#loc8))
|
| 281 |
+
#loc82 = loc("r0_base"(#loc9))
|
| 282 |
+
#loc83 = loc("r0_base"(#loc10))
|
| 283 |
+
#loc84 = loc("x0"(#loc11))
|
| 284 |
+
#loc85 = loc("x1"(#loc12))
|
| 285 |
+
#loc86 = loc("_tmp2"(#loc13))
|
| 286 |
+
#loc87 = loc("_tmp2_index"(#loc14))
|
| 287 |
+
#loc88 = loc("_tmp2"(#loc15))
|
| 288 |
+
#loc89 = loc("r0_index"(#loc16))
|
| 289 |
+
#loc90 = loc("r0_mask"(#loc17))
|
| 290 |
+
#loc91 = loc("tmp0"(#loc18))
|
| 291 |
+
#loc92 = loc("tmp0"(#loc19))
|
| 292 |
+
#loc93 = loc("tmp0"(#loc20))
|
| 293 |
+
#loc94 = loc("tmp0"(#loc21))
|
| 294 |
+
#loc95 = loc("tmp0"(#loc22))
|
| 295 |
+
#loc96 = loc("tmp0"(#loc23))
|
| 296 |
+
#loc97 = loc("_tmp2"(#loc25))
|
| 297 |
+
#loc98 = loc("_tmp2_index"(#loc26))
|
| 298 |
+
#loc99 = loc("tmp2"(#loc29))
|
| 299 |
+
#loc104 = loc("mask"(#loc34))
|
| 300 |
+
#loc105 = loc("equal"(#loc35))
|
| 301 |
+
#loc106 = loc("a_isnan"(#loc38))
|
| 302 |
+
#loc107 = loc("b_isnan"(#loc39))
|
| 303 |
+
#loc108 = loc("mask"(#loc40))
|
| 304 |
+
#loc109 = loc("mask"(#loc41))
|
| 305 |
+
#loc110 = loc("mask"(#loc42))
|
| 306 |
+
#loc111 = loc("equal"(#loc43))
|
| 307 |
+
#loc112 = loc("equal"(#loc44))
|
| 308 |
+
#loc113 = loc("mask"(#loc46))
|
| 309 |
+
#loc114 = loc("mask"(#loc47))
|
| 310 |
+
#loc115 = loc("mask"(#loc48))
|
| 311 |
+
#loc120 = loc("_tmp2_index"(#loc88))
|
| 312 |
+
#loc121 = loc("mask"(#loc104))
|
| 313 |
+
#loc122 = loc("equal"(#loc105))
|
| 314 |
+
#loc123 = loc("mask"(#loc110))
|
| 315 |
+
#loc124 = loc("equal"(#loc112))
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 4], order = [1, 0]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 16], order = [0, 1]}>
|
| 3 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0)
|
| 4 |
+
#loc1 = loc(unknown)
|
| 5 |
+
#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75)
|
| 6 |
+
#loc42 = loc("in_ptr0"(#loc))
|
| 7 |
+
#loc43 = loc("out_ptr0"(#loc))
|
| 8 |
+
#loc44 = loc("xnumel"(#loc))
|
| 9 |
+
#loc45 = loc("r0_numel"(#loc))
|
| 10 |
+
#loc79 = loc(callsite(#loc1 at #loc37))
|
| 11 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 12 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 13 |
+
%cst = arith.constant dense<65760000> : tensor<8x1xi32, #blocked> loc(#loc1)
|
| 14 |
+
%cst_0 = arith.constant dense<32000> : tensor<8x1xi32, #blocked> loc(#loc1)
|
| 15 |
+
%cst_1 = arith.constant dense<32000> : tensor<1x512xi32, #blocked> loc(#loc1)
|
| 16 |
+
%cst_2 = arith.constant dense<2048> : tensor<8x1xi32, #blocked> loc(#loc1)
|
| 17 |
+
%c8_i32 = arith.constant 8 : i32 loc(#loc1)
|
| 18 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<8x512xf32, #blocked> loc(#loc1)
|
| 19 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 20 |
+
%c32000_i32 = arith.constant 32000 : i32 loc(#loc1)
|
| 21 |
+
%c512_i32 = arith.constant 512 : i32 loc(#loc1)
|
| 22 |
+
%true = arith.constant true loc(#loc1)
|
| 23 |
+
%cst_4 = arith.constant dense<true> : tensor<8x512xi1, #blocked> loc(#loc1)
|
| 24 |
+
%cst_5 = arith.constant dense<2147483647> : tensor<8x512xi32, #blocked> loc(#loc1)
|
| 25 |
+
%cst_6 = arith.constant dense<0xFF800000> : tensor<8x512xf32, #blocked> loc(#loc1)
|
| 26 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc46)
|
| 27 |
+
%xoffset_7 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc47)
|
| 28 |
+
%xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48)
|
| 29 |
+
%xindex_8 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48)
|
| 30 |
+
%xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc48)
|
| 31 |
+
%xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc48)
|
| 32 |
+
%xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked> loc(#loc49)
|
| 33 |
+
%xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc49)
|
| 34 |
+
%xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<8x1xi32, #blocked> loc(#loc49)
|
| 35 |
+
%xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked1> loc(#loc49)
|
| 36 |
+
%r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc50)
|
| 37 |
+
%r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x512xi32, #blocked> loc(#loc50)
|
| 38 |
+
%x0 = arith.remsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc51)
|
| 39 |
+
%x1 = arith.divsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc52)
|
| 40 |
+
%tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc53)
|
| 41 |
+
%tmp0_16 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc54)
|
| 42 |
+
%tmp0_17 = arith.muli %x1, %cst : tensor<8x1xi32, #blocked> loc(#loc55)
|
| 43 |
+
%tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc56)
|
| 44 |
+
%tmp0_19 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<8x512x!tt.ptr<f32>, #blocked> loc(#loc57)
|
| 45 |
+
%_tmp2_index:2 = scf.for %_tmp2_index_20 = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2 = %cst_6, %_tmp2_index_21 = %cst_5) -> (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) : i32 {
|
| 46 |
+
%r0_index = tt.splat %_tmp2_index_20 : i32 -> tensor<1x512xi32, #blocked> loc(#loc59)
|
| 47 |
+
%r0_index_22 = arith.addi %r0_index, %r0_base_15 : tensor<1x512xi32, #blocked> loc(#loc59)
|
| 48 |
+
%r0_mask = arith.cmpi slt, %r0_index_22, %cst_1 : tensor<1x512xi32, #blocked> loc(#loc60)
|
| 49 |
+
%tmp0_23 = tt.broadcast %r0_index_22 : tensor<1x512xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc54)
|
| 50 |
+
%tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<8x512xi32, #blocked> loc(#loc54)
|
| 51 |
+
%tmp0_25 = arith.addi %tmp0_24, %tmp0_18 : tensor<8x512xi32, #blocked> loc(#loc56)
|
| 52 |
+
%tmp0_26 = tt.addptr %tmp0_19, %tmp0_25 : tensor<8x512x!tt.ptr<f32>, #blocked>, tensor<8x512xi32, #blocked> loc(#loc57)
|
| 53 |
+
%tmp0_27 = tt.broadcast %r0_mask : tensor<1x512xi1, #blocked> -> tensor<8x512xi1, #blocked> loc(#loc61)
|
| 54 |
+
%tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_3 evictionPolicy = evict_first : tensor<8x512x!tt.ptr<f32>, #blocked> loc(#loc61)
|
| 55 |
+
%mask = arith.cmpf ogt, %_tmp2, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc104)
|
| 56 |
+
%equal = arith.cmpf oeq, %_tmp2, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc105)
|
| 57 |
+
%a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<8x512xf32, #blocked> loc(#loc84)
|
| 58 |
+
%b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc85)
|
| 59 |
+
%mask_29 = arith.xori %b_isnan, %cst_4 : tensor<8x512xi1, #blocked> loc(#loc86)
|
| 60 |
+
%mask_30 = arith.andi %a_isnan, %mask_29 : tensor<8x512xi1, #blocked> loc(#loc87)
|
| 61 |
+
%mask_31 = arith.ori %mask, %mask_30 : tensor<8x512xi1, #blocked> loc(#loc106)
|
| 62 |
+
%equal_32 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1, #blocked> loc(#loc89)
|
| 63 |
+
%equal_33 = arith.ori %equal, %equal_32 : tensor<8x512xi1, #blocked> loc(#loc107)
|
| 64 |
+
%mask_34 = arith.cmpi slt, %_tmp2_index_21, %tmp0_23 : tensor<8x512xi32, #blocked> loc(#loc91)
|
| 65 |
+
%mask_35 = arith.andi %equal_33, %mask_34 : tensor<8x512xi1, #blocked> loc(#loc92)
|
| 66 |
+
%mask_36 = arith.ori %mask_31, %mask_35 : tensor<8x512xi1, #blocked> loc(#loc93)
|
| 67 |
+
%5 = arith.select %mask_36, %_tmp2, %tmp0_28 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc74)
|
| 68 |
+
%6 = arith.select %mask_36, %_tmp2_index_21, %tmp0_23 : tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc75)
|
| 69 |
+
%_tmp2_37 = arith.select %tmp0_27, %5, %_tmp2 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc76)
|
| 70 |
+
%_tmp2_index_38 = arith.select %tmp0_27, %6, %_tmp2_index_21 : tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc77)
|
| 71 |
+
scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked> loc(#loc35)
|
| 72 |
+
} loc(#loc81)
|
| 73 |
+
%0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({
|
| 74 |
+
^bb0(%arg4: f32 loc(callsite(#loc1 at #loc37)), %arg5: i32 loc(callsite(#loc1 at #loc37)), %arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37))):
|
| 75 |
+
%mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108)
|
| 76 |
+
%equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109)
|
| 77 |
+
%a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94)
|
| 78 |
+
%b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95)
|
| 79 |
+
%mask_20 = arith.xori %b_isnan, %true : i1 loc(#loc96)
|
| 80 |
+
%mask_21 = arith.andi %a_isnan, %mask_20 : i1 loc(#loc97)
|
| 81 |
+
%mask_22 = arith.ori %mask, %mask_21 : i1 loc(#loc110)
|
| 82 |
+
%equal_23 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98)
|
| 83 |
+
%equal_24 = arith.ori %equal, %equal_23 : i1 loc(#loc111)
|
| 84 |
+
%mask_25 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99)
|
| 85 |
+
%mask_26 = arith.andi %equal_24, %mask_25 : i1 loc(#loc100)
|
| 86 |
+
%mask_27 = arith.ori %mask_22, %mask_26 : i1 loc(#loc101)
|
| 87 |
+
%5 = arith.select %mask_27, %arg4, %arg6 : f32 loc(#loc102)
|
| 88 |
+
%6 = arith.select %mask_27, %arg5, %arg7 : i32 loc(#loc103)
|
| 89 |
+
tt.reduce.return %5, %6 : f32, i32 loc(#loc78)
|
| 90 |
+
}) : (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) -> (tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78)
|
| 91 |
+
%tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc80)
|
| 92 |
+
%1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<8x1x!tt.ptr<i64>, #blocked1> loc(#loc39)
|
| 93 |
+
%2 = tt.addptr %1, %xindex_14 : tensor<8x1x!tt.ptr<i64>, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc39)
|
| 94 |
+
%3 = ttg.convert_layout %tmp2 : tensor<8x1xi32, #blocked> -> tensor<8x1xi32, #blocked1> loc(#loc40)
|
| 95 |
+
%4 = arith.extsi %3 : tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc40)
|
| 96 |
+
tt.store %2, %4 : tensor<8x1x!tt.ptr<i64>, #blocked1> loc(#loc40)
|
| 97 |
+
tt.return loc(#loc41)
|
| 98 |
+
} loc(#loc)
|
| 99 |
+
} loc(#loc)
|
| 100 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28)
|
| 101 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:33)
|
| 102 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:44)
|
| 103 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:23)
|
| 104 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37)
|
| 105 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19)
|
| 106 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19)
|
| 107 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47)
|
| 108 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41)
|
| 109 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61)
|
| 110 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52)
|
| 111 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34)
|
| 112 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40)
|
| 113 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31)
|
| 114 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29)
|
| 115 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66)
|
| 116 |
+
#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 117 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38)
|
| 118 |
+
#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 119 |
+
#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 120 |
+
#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 121 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 122 |
+
#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 123 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 124 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 125 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 126 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 127 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 128 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 129 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 130 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 131 |
+
#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46)
|
| 132 |
+
#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58)
|
| 133 |
+
#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8)
|
| 134 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 135 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20)
|
| 136 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25)
|
| 137 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36)
|
| 138 |
+
#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4)
|
| 139 |
+
#loc46 = loc("xoffset"(#loc2))
|
| 140 |
+
#loc47 = loc("xoffset"(#loc3))
|
| 141 |
+
#loc48 = loc("xindex"(#loc4))
|
| 142 |
+
#loc49 = loc("xindex"(#loc5))
|
| 143 |
+
#loc50 = loc("r0_base"(#loc6))
|
| 144 |
+
#loc51 = loc("x0"(#loc7))
|
| 145 |
+
#loc52 = loc("x1"(#loc8))
|
| 146 |
+
#loc53 = loc("tmp0"(#loc9))
|
| 147 |
+
#loc54 = loc("tmp0"(#loc10))
|
| 148 |
+
#loc55 = loc("tmp0"(#loc11))
|
| 149 |
+
#loc56 = loc("tmp0"(#loc12))
|
| 150 |
+
#loc57 = loc("tmp0"(#loc13))
|
| 151 |
+
#loc58 = loc("_tmp2"(#loc14))
|
| 152 |
+
#loc59 = loc("r0_index"(#loc15))
|
| 153 |
+
#loc60 = loc("r0_mask"(#loc16))
|
| 154 |
+
#loc61 = loc("tmp0"(#loc17))
|
| 155 |
+
#loc62 = loc("mask"(#loc18))
|
| 156 |
+
#loc63 = loc("equal"(#loc20))
|
| 157 |
+
#loc64 = loc("a_isnan"(#loc21))
|
| 158 |
+
#loc65 = loc("b_isnan"(#loc22))
|
| 159 |
+
#loc66 = loc("mask"(#loc23))
|
| 160 |
+
#loc67 = loc("mask"(#loc24))
|
| 161 |
+
#loc68 = loc("mask"(#loc25))
|
| 162 |
+
#loc69 = loc("equal"(#loc26))
|
| 163 |
+
#loc70 = loc("equal"(#loc27))
|
| 164 |
+
#loc71 = loc("mask"(#loc28))
|
| 165 |
+
#loc72 = loc("mask"(#loc29))
|
| 166 |
+
#loc73 = loc("mask"(#loc30))
|
| 167 |
+
#loc74 = loc(callsite(#loc31 at #loc19))
|
| 168 |
+
#loc75 = loc(callsite(#loc32 at #loc19))
|
| 169 |
+
#loc76 = loc("_tmp2"(#loc33))
|
| 170 |
+
#loc77 = loc("_tmp2_index"(#loc34))
|
| 171 |
+
#loc78 = loc(callsite(#loc36 at #loc37))
|
| 172 |
+
#loc80 = loc("tmp2"(#loc38))
|
| 173 |
+
#loc81 = loc("_tmp2_index"(#loc58))
|
| 174 |
+
#loc82 = loc("mask"(#loc62))
|
| 175 |
+
#loc83 = loc("equal"(#loc63))
|
| 176 |
+
#loc84 = loc(callsite(#loc64 at #loc19))
|
| 177 |
+
#loc85 = loc(callsite(#loc65 at #loc19))
|
| 178 |
+
#loc86 = loc(callsite(#loc66 at #loc19))
|
| 179 |
+
#loc87 = loc(callsite(#loc67 at #loc19))
|
| 180 |
+
#loc88 = loc("mask"(#loc68))
|
| 181 |
+
#loc89 = loc(callsite(#loc69 at #loc19))
|
| 182 |
+
#loc90 = loc("equal"(#loc70))
|
| 183 |
+
#loc91 = loc(callsite(#loc71 at #loc19))
|
| 184 |
+
#loc92 = loc(callsite(#loc72 at #loc19))
|
| 185 |
+
#loc93 = loc(callsite(#loc73 at #loc19))
|
| 186 |
+
#loc94 = loc(callsite(#loc64 at #loc78))
|
| 187 |
+
#loc95 = loc(callsite(#loc65 at #loc78))
|
| 188 |
+
#loc96 = loc(callsite(#loc66 at #loc78))
|
| 189 |
+
#loc97 = loc(callsite(#loc67 at #loc78))
|
| 190 |
+
#loc98 = loc(callsite(#loc69 at #loc78))
|
| 191 |
+
#loc99 = loc(callsite(#loc71 at #loc78))
|
| 192 |
+
#loc100 = loc(callsite(#loc72 at #loc78))
|
| 193 |
+
#loc101 = loc(callsite(#loc73 at #loc78))
|
| 194 |
+
#loc102 = loc(callsite(#loc31 at #loc78))
|
| 195 |
+
#loc103 = loc(callsite(#loc32 at #loc78))
|
| 196 |
+
#loc104 = loc(callsite(#loc82 at #loc19))
|
| 197 |
+
#loc105 = loc(callsite(#loc83 at #loc19))
|
| 198 |
+
#loc106 = loc(callsite(#loc88 at #loc19))
|
| 199 |
+
#loc107 = loc(callsite(#loc90 at #loc19))
|
| 200 |
+
#loc108 = loc(callsite(#loc82 at #loc78))
|
| 201 |
+
#loc109 = loc(callsite(#loc83 at #loc78))
|
| 202 |
+
#loc110 = loc(callsite(#loc88 at #loc78))
|
| 203 |
+
#loc111 = loc(callsite(#loc90 at #loc78))
|
SpecForge-ext/cache/compiled_kernels/triton/3/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75)
|
| 4 |
+
#loc46 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc47 = loc("out_ptr0"(#loc))
|
| 6 |
+
#loc48 = loc("xnumel"(#loc))
|
| 7 |
+
#loc49 = loc("r0_numel"(#loc))
|
| 8 |
+
#loc50 = loc(callsite(#loc1 at #loc2))
|
| 9 |
+
module {
|
| 10 |
+
tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 11 |
+
%true = arith.constant true loc(#loc50)
|
| 12 |
+
%cst = arith.constant dense<true> : tensor<8x512xi1> loc(#loc1)
|
| 13 |
+
%c512_i32 = arith.constant 512 : i32 loc(#loc3)
|
| 14 |
+
%c32000_i32 = arith.constant 32000 : i32 loc(#loc3)
|
| 15 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
|
| 16 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc1)
|
| 17 |
+
%cst_1 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc1)
|
| 18 |
+
%cst_2 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc1)
|
| 19 |
+
%cst_3 = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc1)
|
| 20 |
+
%_tmp2_index = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc51)
|
| 21 |
+
%_tmp2 = arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc52)
|
| 22 |
+
%cst_4 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc1)
|
| 23 |
+
%c8_i32 = arith.constant 8 : i32 loc(#loc1)
|
| 24 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc53)
|
| 25 |
+
%xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc54)
|
| 26 |
+
%xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc55)
|
| 27 |
+
%xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc56)
|
| 28 |
+
%xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32> loc(#loc57)
|
| 29 |
+
%xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<8x1xi32> loc(#loc57)
|
| 30 |
+
%r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc58)
|
| 31 |
+
%r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc59)
|
| 32 |
+
%x0 = arith.remsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc60)
|
| 33 |
+
%x1 = arith.divsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc61)
|
| 34 |
+
%_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 {
|
| 35 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc63)
|
| 36 |
+
%r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x512xi32> loc(#loc63)
|
| 37 |
+
%r0_mask = arith.cmpi slt, %r0_index_13, %cst_3 : tensor<1x512xi32> loc(#loc64)
|
| 38 |
+
%tmp0 = arith.muli %x0, %cst_2 : tensor<8x1xi32> loc(#loc65)
|
| 39 |
+
%tmp0_14 = tt.broadcast %r0_index_13 : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc66)
|
| 40 |
+
%tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc66)
|
| 41 |
+
%tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x512xi32> loc(#loc66)
|
| 42 |
+
%tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32> loc(#loc67)
|
| 43 |
+
%tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc68)
|
| 44 |
+
%tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x512xi32> loc(#loc68)
|
| 45 |
+
%tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<8x512x!tt.ptr<f32>> loc(#loc69)
|
| 46 |
+
%tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x512x!tt.ptr<f32>>, tensor<8x512xi32> loc(#loc69)
|
| 47 |
+
%tmp0_22 = tt.broadcast %r0_mask : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc70)
|
| 48 |
+
%tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_0 evictionPolicy = evict_first : tensor<8x512x!tt.ptr<f32>> loc(#loc70)
|
| 49 |
+
%mask = arith.cmpf ogt, %_tmp2_11, %tmp0_23 : tensor<8x512xf32> loc(#loc112)
|
| 50 |
+
%equal = arith.cmpf oeq, %_tmp2_11, %tmp0_23 : tensor<8x512xf32> loc(#loc113)
|
| 51 |
+
%a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<8x512xf32> loc(#loc92)
|
| 52 |
+
%b_isnan = arith.cmpf une, %tmp0_23, %tmp0_23 : tensor<8x512xf32> loc(#loc93)
|
| 53 |
+
%mask_24 = arith.xori %b_isnan, %cst : tensor<8x512xi1> loc(#loc94)
|
| 54 |
+
%mask_25 = arith.andi %a_isnan, %mask_24 : tensor<8x512xi1> loc(#loc95)
|
| 55 |
+
%mask_26 = arith.ori %mask, %mask_25 : tensor<8x512xi1> loc(#loc114)
|
| 56 |
+
%equal_27 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc97)
|
| 57 |
+
%equal_28 = arith.ori %equal, %equal_27 : tensor<8x512xi1> loc(#loc115)
|
| 58 |
+
%mask_29 = arith.cmpi slt, %_tmp2_index_12, %tmp0_14 : tensor<8x512xi32> loc(#loc99)
|
| 59 |
+
%mask_30 = arith.andi %equal_28, %mask_29 : tensor<8x512xi1> loc(#loc100)
|
| 60 |
+
%mask_31 = arith.ori %mask_26, %mask_30 : tensor<8x512xi1> loc(#loc101)
|
| 61 |
+
%4 = arith.select %mask_31, %_tmp2_11, %tmp0_23 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc83)
|
| 62 |
+
%5 = arith.select %mask_31, %_tmp2_index_12, %tmp0_14 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc84)
|
| 63 |
+
%_tmp2_32 = arith.select %tmp0_22, %4, %_tmp2_11 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc85)
|
| 64 |
+
%_tmp2_index_33 = arith.select %tmp0_22, %5, %_tmp2_index_12 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc86)
|
| 65 |
+
scf.yield %_tmp2_32, %_tmp2_index_33 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc40)
|
| 66 |
+
} loc(#loc89)
|
| 67 |
+
%0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({
|
| 68 |
+
^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))):
|
| 69 |
+
%mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc116)
|
| 70 |
+
%equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc117)
|
| 71 |
+
%a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc102)
|
| 72 |
+
%b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103)
|
| 73 |
+
%mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc104)
|
| 74 |
+
%mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc105)
|
| 75 |
+
%mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc118)
|
| 76 |
+
%equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc106)
|
| 77 |
+
%equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc119)
|
| 78 |
+
%mask_16 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc107)
|
| 79 |
+
%mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc108)
|
| 80 |
+
%mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc109)
|
| 81 |
+
%4 = arith.select %mask_18, %arg4, %arg6 : f32 loc(#loc110)
|
| 82 |
+
%5 = arith.select %mask_18, %arg5, %arg7 : i32 loc(#loc111)
|
| 83 |
+
tt.reduce.return %4, %5 : f32, i32 loc(#loc87)
|
| 84 |
+
}) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc87)
|
| 85 |
+
%tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc88)
|
| 86 |
+
%1 = tt.splat %out_ptr0 : !tt.ptr<i64> -> tensor<8x1x!tt.ptr<i64>> loc(#loc43)
|
| 87 |
+
%2 = tt.addptr %1, %xindex_8 : tensor<8x1x!tt.ptr<i64>>, tensor<8x1xi32> loc(#loc43)
|
| 88 |
+
%3 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc44)
|
| 89 |
+
tt.store %2, %3 : tensor<8x1x!tt.ptr<i64>> loc(#loc44)
|
| 90 |
+
tt.return loc(#loc45)
|
| 91 |
+
} loc(#loc)
|
| 92 |
+
} loc(#loc)
|
| 93 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40)
|
| 94 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":31:58)
|
| 95 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":30:55)
|
| 96 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28)
|
| 97 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:33)
|
| 98 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:36)
|
| 99 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:44)
|
| 100 |
+
#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:23)
|
| 101 |
+
#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:27)
|
| 102 |
+
#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37)
|
| 103 |
+
#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19)
|
| 104 |
+
#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19)
|
| 105 |
+
#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31)
|
| 106 |
+
#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29)
|
| 107 |
+
#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47)
|
| 108 |
+
#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41)
|
| 109 |
+
#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61)
|
| 110 |
+
#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52)
|
| 111 |
+
#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34)
|
| 112 |
+
#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66)
|
| 113 |
+
#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21)
|
| 114 |
+
#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38)
|
| 115 |
+
#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23)
|
| 116 |
+
#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29)
|
| 117 |
+
#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29)
|
| 118 |
+
#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31)
|
| 119 |
+
#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27)
|
| 120 |
+
#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16)
|
| 121 |
+
#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27)
|
| 122 |
+
#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17)
|
| 123 |
+
#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31)
|
| 124 |
+
#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21)
|
| 125 |
+
#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12)
|
| 126 |
+
#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35)
|
| 127 |
+
#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69)
|
| 128 |
+
#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46)
|
| 129 |
+
#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58)
|
| 130 |
+
#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8)
|
| 131 |
+
#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42)
|
| 132 |
+
#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20)
|
| 133 |
+
#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25)
|
| 134 |
+
#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36)
|
| 135 |
+
#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4)
|
| 136 |
+
#loc51 = loc("_tmp2_index"(#loc4))
|
| 137 |
+
#loc52 = loc("_tmp2"(#loc5))
|
| 138 |
+
#loc53 = loc("xoffset"(#loc6))
|
| 139 |
+
#loc54 = loc("xoffset"(#loc7))
|
| 140 |
+
#loc55 = loc("xindex"(#loc8))
|
| 141 |
+
#loc56 = loc("xindex"(#loc9))
|
| 142 |
+
#loc57 = loc("xindex"(#loc10))
|
| 143 |
+
#loc58 = loc("r0_base"(#loc11))
|
| 144 |
+
#loc59 = loc("r0_base"(#loc12))
|
| 145 |
+
#loc60 = loc("x0"(#loc13))
|
| 146 |
+
#loc61 = loc("x1"(#loc14))
|
| 147 |
+
#loc62 = loc("_tmp2"(#loc3))
|
| 148 |
+
#loc63 = loc("r0_index"(#loc15))
|
| 149 |
+
#loc64 = loc("r0_mask"(#loc16))
|
| 150 |
+
#loc65 = loc("tmp0"(#loc17))
|
| 151 |
+
#loc66 = loc("tmp0"(#loc18))
|
| 152 |
+
#loc67 = loc("tmp0"(#loc19))
|
| 153 |
+
#loc68 = loc("tmp0"(#loc20))
|
| 154 |
+
#loc69 = loc("tmp0"(#loc21))
|
| 155 |
+
#loc70 = loc("tmp0"(#loc22))
|
| 156 |
+
#loc71 = loc("mask"(#loc23))
|
| 157 |
+
#loc72 = loc("equal"(#loc25))
|
| 158 |
+
#loc73 = loc("a_isnan"(#loc26))
|
| 159 |
+
#loc74 = loc("b_isnan"(#loc27))
|
| 160 |
+
#loc75 = loc("mask"(#loc28))
|
| 161 |
+
#loc76 = loc("mask"(#loc29))
|
| 162 |
+
#loc77 = loc("mask"(#loc30))
|
| 163 |
+
#loc78 = loc("equal"(#loc31))
|
| 164 |
+
#loc79 = loc("equal"(#loc32))
|
| 165 |
+
#loc80 = loc("mask"(#loc33))
|
| 166 |
+
#loc81 = loc("mask"(#loc34))
|
| 167 |
+
#loc82 = loc("mask"(#loc35))
|
| 168 |
+
#loc83 = loc(callsite(#loc36 at #loc24))
|
| 169 |
+
#loc84 = loc(callsite(#loc37 at #loc24))
|
| 170 |
+
#loc85 = loc("_tmp2"(#loc38))
|
| 171 |
+
#loc86 = loc("_tmp2_index"(#loc39))
|
| 172 |
+
#loc87 = loc(callsite(#loc41 at #loc2))
|
| 173 |
+
#loc88 = loc("tmp2"(#loc42))
|
| 174 |
+
#loc89 = loc("_tmp2_index"(#loc62))
|
| 175 |
+
#loc90 = loc("mask"(#loc71))
|
| 176 |
+
#loc91 = loc("equal"(#loc72))
|
| 177 |
+
#loc92 = loc(callsite(#loc73 at #loc24))
|
| 178 |
+
#loc93 = loc(callsite(#loc74 at #loc24))
|
| 179 |
+
#loc94 = loc(callsite(#loc75 at #loc24))
|
| 180 |
+
#loc95 = loc(callsite(#loc76 at #loc24))
|
| 181 |
+
#loc96 = loc("mask"(#loc77))
|
| 182 |
+
#loc97 = loc(callsite(#loc78 at #loc24))
|
| 183 |
+
#loc98 = loc("equal"(#loc79))
|
| 184 |
+
#loc99 = loc(callsite(#loc80 at #loc24))
|
| 185 |
+
#loc100 = loc(callsite(#loc81 at #loc24))
|
| 186 |
+
#loc101 = loc(callsite(#loc82 at #loc24))
|
| 187 |
+
#loc102 = loc(callsite(#loc73 at #loc87))
|
| 188 |
+
#loc103 = loc(callsite(#loc74 at #loc87))
|
| 189 |
+
#loc104 = loc(callsite(#loc75 at #loc87))
|
| 190 |
+
#loc105 = loc(callsite(#loc76 at #loc87))
|
| 191 |
+
#loc106 = loc(callsite(#loc78 at #loc87))
|
| 192 |
+
#loc107 = loc(callsite(#loc80 at #loc87))
|
| 193 |
+
#loc108 = loc(callsite(#loc81 at #loc87))
|
| 194 |
+
#loc109 = loc(callsite(#loc82 at #loc87))
|
| 195 |
+
#loc110 = loc(callsite(#loc36 at #loc87))
|
| 196 |
+
#loc111 = loc(callsite(#loc37 at #loc87))
|
| 197 |
+
#loc112 = loc(callsite(#loc90 at #loc24))
|
| 198 |
+
#loc113 = loc(callsite(#loc91 at #loc24))
|
| 199 |
+
#loc114 = loc(callsite(#loc96 at #loc24))
|
| 200 |
+
#loc115 = loc(callsite(#loc98 at #loc24))
|
| 201 |
+
#loc116 = loc(callsite(#loc90 at #loc87))
|
| 202 |
+
#loc117 = loc(callsite(#loc91 at #loc87))
|
| 203 |
+
#loc118 = loc(callsite(#loc96 at #loc87))
|
| 204 |
+
#loc119 = loc(callsite(#loc98 at #loc87))
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/__grp__triton_poi_fused_new_zeros_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_poi_fused_new_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source", "triton_poi_fused_new_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir", "triton_poi_fused_new_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir", "triton_poi_fused_new_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir", "triton_poi_fused_new_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx", "triton_poi_fused_new_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin", "triton_poi_fused_new_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin
ADDED
|
Binary file (5.77 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "e69b538dd589c5b9c913d33cdaa69fd2fa4ad6662ded2e7cc313f4666af16fd4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_0"}
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
; Function Attrs: nounwind
|
| 6 |
+
define ptx_kernel void @triton_poi_fused_new_zeros_0(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 {
|
| 7 |
+
%5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 8 |
+
%6 = shl i32 %5, 8, !dbg !8
|
| 9 |
+
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 10 |
+
%8 = shl nuw nsw i32 %7, 1, !dbg !9
|
| 11 |
+
%9 = and i32 %8, 254, !dbg !9
|
| 12 |
+
%10 = or disjoint i32 %9, %6, !dbg !10
|
| 13 |
+
%11 = or disjoint i32 %10, 1, !dbg !10
|
| 14 |
+
%12 = icmp slt i32 %10, %1, !dbg !11
|
| 15 |
+
%13 = icmp slt i32 %11, %1, !dbg !11
|
| 16 |
+
%14 = sext i32 %10 to i64, !dbg !12
|
| 17 |
+
%15 = getelementptr i32, ptr addrspace(1) %0, i64 %14, !dbg !12
|
| 18 |
+
%16 = sext i32 %11 to i64, !dbg !12
|
| 19 |
+
%17 = getelementptr i32, ptr addrspace(1) %0, i64 %16, !dbg !12
|
| 20 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %15, i1 %12) #2, !dbg !13
|
| 21 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %17, i1 %13) #2, !dbg !13
|
| 22 |
+
ret void, !dbg !14
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 26 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 27 |
+
|
| 28 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 29 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 30 |
+
|
| 31 |
+
attributes #0 = { nounwind "nvvm.reqntid"="128" }
|
| 32 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 33 |
+
attributes #2 = { nounwind }
|
| 34 |
+
|
| 35 |
+
!llvm.dbg.cu = !{!0}
|
| 36 |
+
!llvm.module.flags = !{!2, !3}
|
| 37 |
+
|
| 38 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 39 |
+
!1 = !DIFile(filename: "cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6")
|
| 40 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 41 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 42 |
+
!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_0", linkageName: "triton_poi_fused_new_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 43 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 44 |
+
!6 = !{}
|
| 45 |
+
!7 = !DILocation(line: 19, column: 28, scope: !4)
|
| 46 |
+
!8 = !DILocation(line: 19, column: 33, scope: !4)
|
| 47 |
+
!9 = !DILocation(line: 20, column: 36, scope: !4)
|
| 48 |
+
!10 = !DILocation(line: 20, column: 23, scope: !4)
|
| 49 |
+
!11 = !DILocation(line: 21, column: 21, scope: !4)
|
| 50 |
+
!12 = !DILocation(line: 24, column: 25, scope: !4)
|
| 51 |
+
!13 = !DILocation(line: 24, column: 36, scope: !4)
|
| 52 |
+
!14 = !DILocation(line: 24, column: 4, scope: !4)
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_poi_fused_new_zeros_0 // -- Begin function triton_poi_fused_new_zeros_0
|
| 10 |
+
// @triton_poi_fused_new_zeros_0
|
| 11 |
+
.visible .entry triton_poi_fused_new_zeros_0(
|
| 12 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_0,
|
| 13 |
+
.param .u32 triton_poi_fused_new_zeros_0_param_1,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_2,
|
| 15 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_3
|
| 16 |
+
)
|
| 17 |
+
.reqntid 128
|
| 18 |
+
{
|
| 19 |
+
.reg .pred %p<3>;
|
| 20 |
+
.reg .b32 %r<11>;
|
| 21 |
+
.reg .b64 %rd<4>;
|
| 22 |
+
.loc 1 18 0 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:18:0
|
| 23 |
+
$L__func_begin0:
|
| 24 |
+
.loc 1 18 0 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:18:0
|
| 25 |
+
|
| 26 |
+
// %bb.0:
|
| 27 |
+
ld.param.b64 %rd3, [triton_poi_fused_new_zeros_0_param_0];
|
| 28 |
+
ld.param.b32 %r3, [triton_poi_fused_new_zeros_0_param_1];
|
| 29 |
+
$L__tmp0:
|
| 30 |
+
.loc 1 19 28 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:19:28
|
| 31 |
+
mov.u32 %r4, %ctaid.x;
|
| 32 |
+
.loc 1 19 33 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:19:33
|
| 33 |
+
shl.b32 %r5, %r4, 8;
|
| 34 |
+
.loc 1 20 36 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:20:36
|
| 35 |
+
mov.u32 %r6, %tid.x;
|
| 36 |
+
shl.b32 %r7, %r6, 1;
|
| 37 |
+
and.b32 %r8, %r7, 254;
|
| 38 |
+
.loc 1 20 23 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:20:23
|
| 39 |
+
or.b32 %r9, %r8, %r5;
|
| 40 |
+
or.b32 %r10, %r9, 1;
|
| 41 |
+
.loc 1 21 21 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:21:21
|
| 42 |
+
setp.lt.s32 %p1, %r9, %r3;
|
| 43 |
+
setp.lt.s32 %p2, %r10, %r3;
|
| 44 |
+
.loc 1 24 25 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:24:25
|
| 45 |
+
mad.wide.s32 %rd1, %r9, 4, %rd3;
|
| 46 |
+
add.s64 %rd2, %rd1, 4;
|
| 47 |
+
mov.b32 %r1, 0;
|
| 48 |
+
.loc 1 24 36 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:24:36
|
| 49 |
+
// begin inline asm
|
| 50 |
+
@%p1 st.global.b32 [ %rd1 + 0 ], { %r1 };
|
| 51 |
+
// end inline asm
|
| 52 |
+
// begin inline asm
|
| 53 |
+
@%p2 st.global.b32 [ %rd2 + 0 ], { %r1 };
|
| 54 |
+
// end inline asm
|
| 55 |
+
.loc 1 24 4 // cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py:24:4
|
| 56 |
+
ret;
|
| 57 |
+
$L__tmp1:
|
| 58 |
+
$L__func_end0:
|
| 59 |
+
// -- End function
|
| 60 |
+
}
|
| 61 |
+
.file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py"
|
| 62 |
+
.section .debug_abbrev
|
| 63 |
+
{
|
| 64 |
+
.b8 1 // Abbreviation Code
|
| 65 |
+
.b8 17 // DW_TAG_compile_unit
|
| 66 |
+
.b8 0 // DW_CHILDREN_no
|
| 67 |
+
.b8 37 // DW_AT_producer
|
| 68 |
+
.b8 8 // DW_FORM_string
|
| 69 |
+
.b8 19 // DW_AT_language
|
| 70 |
+
.b8 5 // DW_FORM_data2
|
| 71 |
+
.b8 3 // DW_AT_name
|
| 72 |
+
.b8 8 // DW_FORM_string
|
| 73 |
+
.b8 16 // DW_AT_stmt_list
|
| 74 |
+
.b8 6 // DW_FORM_data4
|
| 75 |
+
.b8 27 // DW_AT_comp_dir
|
| 76 |
+
.b8 8 // DW_FORM_string
|
| 77 |
+
.b8 0 // EOM(1)
|
| 78 |
+
.b8 0 // EOM(2)
|
| 79 |
+
.b8 0 // EOM(3)
|
| 80 |
+
}
|
| 81 |
+
.section .debug_info
|
| 82 |
+
{
|
| 83 |
+
.b32 135 // Length of Unit
|
| 84 |
+
.b8 2 // DWARF version number
|
| 85 |
+
.b8 0
|
| 86 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 87 |
+
.b8 8 // Address Size (in bytes)
|
| 88 |
+
.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit
|
| 89 |
+
.b8 116 // DW_AT_producer
|
| 90 |
+
.b8 114
|
| 91 |
+
.b8 105
|
| 92 |
+
.b8 116
|
| 93 |
+
.b8 111
|
| 94 |
+
.b8 110
|
| 95 |
+
.b8 0
|
| 96 |
+
.b8 2 // DW_AT_language
|
| 97 |
+
.b8 0
|
| 98 |
+
.b8 99 // DW_AT_name
|
| 99 |
+
.b8 106
|
| 100 |
+
.b8 54
|
| 101 |
+
.b8 108
|
| 102 |
+
.b8 98
|
| 103 |
+
.b8 50
|
| 104 |
+
.b8 108
|
| 105 |
+
.b8 119
|
| 106 |
+
.b8 97
|
| 107 |
+
.b8 98
|
| 108 |
+
.b8 52
|
| 109 |
+
.b8 51
|
| 110 |
+
.b8 122
|
| 111 |
+
.b8 118
|
| 112 |
+
.b8 101
|
| 113 |
+
.b8 108
|
| 114 |
+
.b8 51
|
| 115 |
+
.b8 52
|
| 116 |
+
.b8 122
|
| 117 |
+
.b8 51
|
| 118 |
+
.b8 119
|
| 119 |
+
.b8 115
|
| 120 |
+
.b8 100
|
| 121 |
+
.b8 122
|
| 122 |
+
.b8 103
|
| 123 |
+
.b8 106
|
| 124 |
+
.b8 110
|
| 125 |
+
.b8 115
|
| 126 |
+
.b8 55
|
| 127 |
+
.b8 101
|
| 128 |
+
.b8 102
|
| 129 |
+
.b8 119
|
| 130 |
+
.b8 121
|
| 131 |
+
.b8 118
|
| 132 |
+
.b8 106
|
| 133 |
+
.b8 100
|
| 134 |
+
.b8 50
|
| 135 |
+
.b8 121
|
| 136 |
+
.b8 99
|
| 137 |
+
.b8 101
|
| 138 |
+
.b8 120
|
| 139 |
+
.b8 113
|
| 140 |
+
.b8 106
|
| 141 |
+
.b8 51
|
| 142 |
+
.b8 98
|
| 143 |
+
.b8 110
|
| 144 |
+
.b8 97
|
| 145 |
+
.b8 121
|
| 146 |
+
.b8 105
|
| 147 |
+
.b8 118
|
| 148 |
+
.b8 104
|
| 149 |
+
.b8 54
|
| 150 |
+
.b8 46
|
| 151 |
+
.b8 112
|
| 152 |
+
.b8 121
|
| 153 |
+
.b8 0
|
| 154 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 155 |
+
.b8 47 // DW_AT_comp_dir
|
| 156 |
+
.b8 119
|
| 157 |
+
.b8 111
|
| 158 |
+
.b8 114
|
| 159 |
+
.b8 107
|
| 160 |
+
.b8 115
|
| 161 |
+
.b8 112
|
| 162 |
+
.b8 97
|
| 163 |
+
.b8 99
|
| 164 |
+
.b8 101
|
| 165 |
+
.b8 47
|
| 166 |
+
.b8 104
|
| 167 |
+
.b8 97
|
| 168 |
+
.b8 110
|
| 169 |
+
.b8 114
|
| 170 |
+
.b8 117
|
| 171 |
+
.b8 105
|
| 172 |
+
.b8 47
|
| 173 |
+
.b8 83
|
| 174 |
+
.b8 112
|
| 175 |
+
.b8 101
|
| 176 |
+
.b8 99
|
| 177 |
+
.b8 70
|
| 178 |
+
.b8 111
|
| 179 |
+
.b8 114
|
| 180 |
+
.b8 103
|
| 181 |
+
.b8 101
|
| 182 |
+
.b8 45
|
| 183 |
+
.b8 101
|
| 184 |
+
.b8 120
|
| 185 |
+
.b8 116
|
| 186 |
+
.b8 47
|
| 187 |
+
.b8 99
|
| 188 |
+
.b8 97
|
| 189 |
+
.b8 99
|
| 190 |
+
.b8 104
|
| 191 |
+
.b8 101
|
| 192 |
+
.b8 47
|
| 193 |
+
.b8 99
|
| 194 |
+
.b8 111
|
| 195 |
+
.b8 109
|
| 196 |
+
.b8 112
|
| 197 |
+
.b8 105
|
| 198 |
+
.b8 108
|
| 199 |
+
.b8 101
|
| 200 |
+
.b8 100
|
| 201 |
+
.b8 95
|
| 202 |
+
.b8 107
|
| 203 |
+
.b8 101
|
| 204 |
+
.b8 114
|
| 205 |
+
.b8 110
|
| 206 |
+
.b8 101
|
| 207 |
+
.b8 108
|
| 208 |
+
.b8 115
|
| 209 |
+
.b8 47
|
| 210 |
+
.b8 106
|
| 211 |
+
.b8 54
|
| 212 |
+
.b8 0
|
| 213 |
+
}
|
| 214 |
+
.section .debug_macinfo { }
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":18:0)
|
| 2 |
+
#loc10 = loc("out_ptr0"(#loc))
|
| 3 |
+
#loc11 = loc("xnumel"(#loc))
|
| 4 |
+
module {
|
| 5 |
+
tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 6 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc12)
|
| 7 |
+
%xoffset_0 = arith.constant 256 : i32 loc(#loc13)
|
| 8 |
+
%xoffset_1 = arith.constant 256 : i32 loc(#loc13)
|
| 9 |
+
%xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc13)
|
| 10 |
+
%xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14)
|
| 11 |
+
%xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc15)
|
| 12 |
+
%xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc15)
|
| 13 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16)
|
| 14 |
+
%xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc16)
|
| 15 |
+
%tmp0 = arith.constant 0 : i32 loc(#loc17)
|
| 16 |
+
%tmp0_6 = arith.constant dense<0> : tensor<1xi32> loc(#loc17)
|
| 17 |
+
%0 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>> loc(#loc7)
|
| 18 |
+
%1 = tt.addptr %0, %xindex_4 : tensor<256x!tt.ptr<i32>>, tensor<256xi32> loc(#loc7)
|
| 19 |
+
%cst = arith.constant dense<0> : tensor<256xi32> loc(#loc8)
|
| 20 |
+
tt.store %1, %cst, %xmask_5 : tensor<256x!tt.ptr<i32>> loc(#loc8)
|
| 21 |
+
tt.return loc(#loc9)
|
| 22 |
+
} loc(#loc)
|
| 23 |
+
} loc(#loc)
|
| 24 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:28)
|
| 25 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:33)
|
| 26 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:36)
|
| 27 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:23)
|
| 28 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":21:21)
|
| 29 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":23:27)
|
| 30 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:25)
|
| 31 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:36)
|
| 32 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:4)
|
| 33 |
+
#loc12 = loc("xoffset"(#loc1))
|
| 34 |
+
#loc13 = loc("xoffset"(#loc2))
|
| 35 |
+
#loc14 = loc("xindex"(#loc3))
|
| 36 |
+
#loc15 = loc("xindex"(#loc4))
|
| 37 |
+
#loc16 = loc("xmask"(#loc5))
|
| 38 |
+
#loc17 = loc("tmp0"(#loc6))
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
|
| 2 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":18:0)
|
| 3 |
+
#loc10 = loc("out_ptr0"(#loc))
|
| 4 |
+
#loc11 = loc("xnumel"(#loc))
|
| 5 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 6 |
+
tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 7 |
+
%c256_i32 = arith.constant 256 : i32 loc(#loc1)
|
| 8 |
+
%cst = arith.constant dense<0> : tensor<256xi32, #blocked> loc(#loc1)
|
| 9 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc12)
|
| 10 |
+
%xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13)
|
| 11 |
+
%xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc14)
|
| 12 |
+
%xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32, #blocked> loc(#loc15)
|
| 13 |
+
%xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32, #blocked> loc(#loc15)
|
| 14 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<256xi32, #blocked> loc(#loc16)
|
| 15 |
+
%xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32, #blocked> loc(#loc16)
|
| 16 |
+
%0 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>, #blocked> loc(#loc7)
|
| 17 |
+
%1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr<i32>, #blocked>, tensor<256xi32, #blocked> loc(#loc7)
|
| 18 |
+
tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr<i32>, #blocked> loc(#loc8)
|
| 19 |
+
tt.return loc(#loc9)
|
| 20 |
+
} loc(#loc)
|
| 21 |
+
} loc(#loc)
|
| 22 |
+
#loc1 = loc(unknown)
|
| 23 |
+
#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:28)
|
| 24 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:33)
|
| 25 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:36)
|
| 26 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:23)
|
| 27 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":21:21)
|
| 28 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:25)
|
| 29 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:36)
|
| 30 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:4)
|
| 31 |
+
#loc12 = loc("xoffset"(#loc2))
|
| 32 |
+
#loc13 = loc("xoffset"(#loc3))
|
| 33 |
+
#loc14 = loc("xindex"(#loc4))
|
| 34 |
+
#loc15 = loc("xindex"(#loc5))
|
| 35 |
+
#loc16 = loc("xmask"(#loc6))
|
SpecForge-ext/cache/compiled_kernels/triton/3/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":18:0)
|
| 2 |
+
#loc10 = loc("out_ptr0"(#loc))
|
| 3 |
+
#loc11 = loc("xnumel"(#loc))
|
| 4 |
+
module {
|
| 5 |
+
tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 6 |
+
%cst = arith.constant dense<0> : tensor<256xi32> loc(#loc1)
|
| 7 |
+
%c256_i32 = arith.constant 256 : i32 loc(#loc2)
|
| 8 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc12)
|
| 9 |
+
%xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13)
|
| 10 |
+
%xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14)
|
| 11 |
+
%xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc15)
|
| 12 |
+
%xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc15)
|
| 13 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16)
|
| 14 |
+
%xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc16)
|
| 15 |
+
%0 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>> loc(#loc8)
|
| 16 |
+
%1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr<i32>>, tensor<256xi32> loc(#loc8)
|
| 17 |
+
tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr<i32>> loc(#loc1)
|
| 18 |
+
tt.return loc(#loc9)
|
| 19 |
+
} loc(#loc)
|
| 20 |
+
} loc(#loc)
|
| 21 |
+
#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:36)
|
| 22 |
+
#loc2 = loc(unknown)
|
| 23 |
+
#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:28)
|
| 24 |
+
#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":19:33)
|
| 25 |
+
#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:36)
|
| 26 |
+
#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":20:23)
|
| 27 |
+
#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":21:21)
|
| 28 |
+
#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:25)
|
| 29 |
+
#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j6/cj6lb2lwab43zvel34z3wsdzgjns7efwyvjd2ycexqj3bnayivh6.py":24:4)
|
| 30 |
+
#loc12 = loc("xoffset"(#loc3))
|
| 31 |
+
#loc13 = loc("xoffset"(#loc4))
|
| 32 |
+
#loc14 = loc("xindex"(#loc5))
|
| 33 |
+
#loc15 = loc("xindex"(#loc6))
|
| 34 |
+
#loc16 = loc("xmask"(#loc7))
|
SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/__grp__triton_red_fused_zeros_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json"}}
|
SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin
ADDED
|
Binary file (15.7 kB). View file
|
|
|
SpecForge-ext/cache/compiled_kernels/triton/3/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "e1b3f1564b04bc7aede55cff09d6f5ffbd51b34448fc5f1195c8448b5e81fed8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"}
|