Lekr0 commited on
Commit
f63bfc5
·
verified ·
1 Parent(s): 0146652

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json +1 -0
  2. SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin +0 -0
  3. SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json +1 -0
  4. SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx +525 -0
  5. SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir +147 -0
  6. SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir +152 -0
  7. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
  8. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin +0 -0
  9. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json +1 -0
  10. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir +0 -0
  11. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx +0 -0
  12. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source +0 -0
  13. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir +841 -0
  14. SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir +799 -0
  15. SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.cubin +0 -0
  16. SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.llir +220 -0
  17. SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ptx +522 -0
  18. SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.source +120 -0
  19. SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ttgir +81 -0
  20. SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ttir +84 -0
  21. SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json +1 -0
  22. SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin +0 -0
  23. SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir +233 -0
  24. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json +1 -0
  25. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin +0 -0
  26. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json +1 -0
  27. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir +808 -0
  28. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx +1936 -0
  29. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source +419 -0
  30. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir +284 -0
  31. SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir +283 -0
  32. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/__grp__triton_red_fused_zeros_0.json +1 -0
  33. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.cubin +0 -0
  34. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.json +1 -0
  35. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.llir +170 -0
  36. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ptx +411 -0
  37. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.source +222 -0
  38. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ttgir +155 -0
  39. SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ttir +152 -0
  40. SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin +0 -0
  41. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json +1 -0
  42. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin +0 -0
  43. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json +1 -0
  44. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir +89 -0
  45. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx +311 -0
  46. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source +226 -0
  47. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir +122 -0
  48. SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir +121 -0
  49. SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/__grp__triton_red_fused_argmax_0.json +1 -0
  50. SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.cubin +0 -0
SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused__to_copy_clone_slice_sum_transpose_5.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json"}}
SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin ADDED
Binary file (17.2 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d1c2e6527ce27b628a96c1a250025b39aad19d679fce295e820390aa7ae64b66", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_clone_slice_sum_transpose_5"}
SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_red_fused__to_copy_clone_slice_sum_transpose_5
12
+ .visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5(
13
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1,
15
+ .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2,
16
+ .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3,
17
+ .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4,
18
+ .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5,
19
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6,
20
+ .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7
21
+ )
22
+ .reqntid 128
23
+ {
24
+ .reg .pred %p<24>;
25
+ .reg .b32 %r<51>;
26
+ .reg .b64 %rd<97>;
27
+ .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0
28
+ $L__func_begin0:
29
+ .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0
30
+
31
+ // %bb.0:
32
+ ld.param.b32 %r11, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5];
33
+ ld.param.b64 %rd20, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2];
34
+ $L__tmp0:
35
+ .loc 1 21 28 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:28
36
+ mov.u32 %r12, %ctaid.x;
37
+ .loc 1 21 33 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:33
38
+ shl.b32 %r13, %r12, 5;
39
+ .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44
40
+ mov.u32 %r1, %tid.x;
41
+ and.b32 %r2, %r1, 31;
42
+ .loc 1 22 23 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:23
43
+ or.b32 %r14, %r13, %r2;
44
+ .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19
45
+ cvt.s64.s32 %rd1, %r14;
46
+ .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19
47
+ or.b64 %rd23, %rd1, %rd20;
48
+ and.b64 %rd24, %rd23, -4294967296;
49
+ setp.ne.b64 %p5, %rd24, 0;
50
+ cvt.u32.u64 %r49, %rd1;
51
+ @%p5 bra $L__BB0_2;
52
+ bra.uni $L__BB0_1;
53
+ $L__BB0_2:
54
+ div.s64 %rd91, %rd1, %rd20;
55
+ bra.uni $L__BB0_3;
56
+ $L__BB0_1:
57
+ cvt.u32.u64 %r15, %rd20;
58
+ div.u32 %r17, %r49, %r15;
59
+ cvt.u64.u32 %rd91, %r17;
60
+ $L__BB0_3:
61
+ .loc 1 0 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:19
62
+ ld.param.b32 %r10, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4];
63
+ ld.param.b64 %rd19, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1];
64
+ bfe.u32 %r3, %r1, 5, 2;
65
+ .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19
66
+ mul.lo.s64 %rd26, %rd91, %rd20;
67
+ sub.s64 %rd6, %rd1, %rd26;
68
+ .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40
69
+ setp.lt.s32 %p6, %r11, 1;
70
+ mov.b64 %rd96, 0;
71
+ shl.b64 %rd90, %rd6, 2;
72
+ @%p6 bra $L__BB0_7;
73
+ // %bb.4: // %.lr.ph
74
+ .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40
75
+ ld.param.b64 %rd21, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3];
76
+ ld.param.b64 %rd18, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0];
77
+ .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21
78
+ setp.lt.s32 %p1, %r49, %r10;
79
+ .loc 1 36 54 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:54
80
+ mul.lo.s64 %rd31, %rd21, %rd20;
81
+ .loc 1 36 58 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:58
82
+ mul.lo.s64 %rd32, %rd31, %rd91;
83
+ add.s64 %rd34, %rd18, %rd90;
84
+ .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40
85
+ shl.b64 %rd35, %rd32, 2;
86
+ add.s64 %rd7, %rd34, %rd35;
87
+ mov.b64 %rd92, 0;
88
+ mov.b32 %r50, 0;
89
+ mov.b64 %rd93, %rd92;
90
+ mov.b64 %rd94, %rd92;
91
+ mov.b64 %rd95, %rd92;
92
+ $L__BB0_5: // =>This Inner Loop Header: Depth=1
93
+ .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63
94
+ // begin inline asm
95
+ mov.u64 %rd36, 0x0;
96
+ createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0;
97
+ // end inline asm
98
+ .loc 1 31 31 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:31:31
99
+ add.s32 %r24, %r3, %r50;
100
+ add.s32 %r25, %r24, 4;
101
+ add.s32 %r26, %r24, 8;
102
+ .loc 1 32 29 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:32:29
103
+ add.s32 %r27, %r24, 12;
104
+ setp.lt.s32 %p11, %r24, %r11;
105
+ setp.lt.s32 %p12, %r25, %r11;
106
+ setp.lt.s32 %p13, %r26, %r11;
107
+ setp.lt.s32 %p14, %r27, %r11;
108
+ .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43
109
+ cvt.s64.s32 %rd48, %r24;
110
+ cvt.s64.s32 %rd49, %r25;
111
+ cvt.s64.s32 %rd50, %r26;
112
+ cvt.s64.s32 %rd51, %r27;
113
+ mul.lo.s64 %rd52, %rd20, %rd48;
114
+ mul.lo.s64 %rd53, %rd20, %rd49;
115
+ mul.lo.s64 %rd54, %rd20, %rd50;
116
+ mul.lo.s64 %rd55, %rd20, %rd51;
117
+ .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34
118
+ shl.b64 %rd56, %rd52, 2;
119
+ add.s64 %rd37, %rd7, %rd56;
120
+ shl.b64 %rd57, %rd53, 2;
121
+ add.s64 %rd40, %rd7, %rd57;
122
+ shl.b64 %rd58, %rd54, 2;
123
+ add.s64 %rd43, %rd7, %rd58;
124
+ shl.b64 %rd59, %rd55, 2;
125
+ add.s64 %rd46, %rd7, %rd59;
126
+ .loc 1 36 73 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:73
127
+ and.pred %p10, %p1, %p14;
128
+ and.pred %p9, %p1, %p13;
129
+ and.pred %p8, %p1, %p12;
130
+ and.pred %p7, %p1, %p11;
131
+ .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63
132
+ // begin inline asm
133
+ mov.u32 %r20, 0x0;
134
+ @%p7 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd37 + 0 ], %rd36;
135
+ // end inline asm
136
+ // begin inline asm
137
+ mov.u64 %rd39, 0x0;
138
+ createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
139
+ // end inline asm
140
+ // begin inline asm
141
+ mov.u32 %r21, 0x0;
142
+ @%p8 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd40 + 0 ], %rd39;
143
+ // end inline asm
144
+ // begin inline asm
145
+ mov.u64 %rd42, 0x0;
146
+ createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0;
147
+ // end inline asm
148
+ // begin inline asm
149
+ mov.u32 %r22, 0x0;
150
+ @%p9 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd43 + 0 ], %rd42;
151
+ // end inline asm
152
+ // begin inline asm
153
+ mov.u64 %rd45, 0x0;
154
+ createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
155
+ // end inline asm
156
+ // begin inline asm
157
+ mov.u32 %r23, 0x0;
158
+ @%p10 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd46 + 0 ], %rd45;
159
+ // end inline asm
160
+ .loc 1 37 23 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:37:23
161
+ cvt.s64.s32 %rd60, %r20;
162
+ cvt.s64.s32 %rd61, %r21;
163
+ cvt.s64.s32 %rd62, %r22;
164
+ cvt.s64.s32 %rd63, %r23;
165
+ .loc 1 40 48 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:40:48
166
+ selp.b64 %rd64, %rd63, 0, %p10;
167
+ selp.b64 %rd65, %rd62, 0, %p9;
168
+ selp.b64 %rd66, %rd61, 0, %p8;
169
+ selp.b64 %rd67, %rd60, 0, %p7;
170
+ add.s64 %rd92, %rd67, %rd92;
171
+ add.s64 %rd93, %rd66, %rd93;
172
+ add.s64 %rd94, %rd65, %rd94;
173
+ add.s64 %rd95, %rd64, %rd95;
174
+ .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40
175
+ add.s32 %r50, %r50, 16;
176
+ setp.lt.s32 %p15, %r50, %r11;
177
+ @%p15 bra $L__BB0_5;
178
+ // %bb.6: // %._crit_edge.loopexit
179
+ $L__tmp1:
180
+ .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ]
181
+ add.s64 %rd68, %rd92, %rd94;
182
+ add.s64 %rd69, %rd93, %rd95;
183
+ add.s64 %rd96, %rd68, %rd69;
184
+ $L__tmp2:
185
+ $L__BB0_7: // %._crit_edge
186
+ .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21
187
+ setp.lt.s32 %p20, %r49, %r10;
188
+ $L__tmp3:
189
+ .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ]
190
+ shl.b32 %r33, %r2, 5;
191
+ mov.b32 %r34, global_smem;
192
+ add.s32 %r35, %r34, %r33;
193
+ shl.b32 %r36, %r3, 3;
194
+ add.s32 %r28, %r35, %r36;
195
+ mov.pred %p16, -1;
196
+ // begin inline asm
197
+ @%p16 st.shared.b64 [ %r28 + 0 ], %rd96;
198
+ // end inline asm
199
+ bar.sync 0;
200
+ setp.lt.u32 %p17, %r1, 128;
201
+ shl.b32 %r37, %r1, 3;
202
+ add.s32 %r29, %r34, %r37;
203
+ // begin inline asm
204
+ @%p17 ld.shared.b64 %rd71, [ %r29 + 0 ];
205
+ // end inline asm
206
+ mov.b64 {_, %r38}, %rd71;
207
+ cvt.u32.u64 %r39, %rd71;
208
+ shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
209
+ shfl.sync.bfly.b32 %r41, %r38, 2, 31, -1;
210
+ cvt.u64.u32 %rd74, %r40;
211
+ cvt.u64.u32 %rd75, %r41;
212
+ shl.b64 %rd76, %rd75, 32;
213
+ or.b64 %rd77, %rd74, %rd76;
214
+ .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ]
215
+ add.s64 %rd78, %rd71, %rd77;
216
+ .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ]
217
+ mov.b64 {_, %r42}, %rd78;
218
+ cvt.u32.u64 %r43, %rd78;
219
+ shfl.sync.bfly.b32 %r44, %r43, 1, 31, -1;
220
+ shfl.sync.bfly.b32 %r45, %r42, 1, 31, -1;
221
+ cvt.u64.u32 %rd79, %r44;
222
+ cvt.u64.u32 %rd80, %r45;
223
+ shl.b64 %rd81, %rd80, 32;
224
+ or.b64 %rd82, %rd79, %rd81;
225
+ .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ]
226
+ add.s64 %rd72, %rd78, %rd82;
227
+ .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ]
228
+ and.b32 %r46, %r1, 899;
229
+ setp.eq.b32 %p18, %r46, 0;
230
+ // begin inline asm
231
+ @%p18 st.shared.b64 [ %r29 + 0 ], %rd72;
232
+ // end inline asm
233
+ bar.sync 0;
234
+ ld.shared.b32 %r31, [%r35];
235
+ $L__tmp4:
236
+ .loc 1 43 49 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:49
237
+ setp.lt.s64 %p21, %rd20, 2;
238
+ .loc 1 43 75 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:75
239
+ setp.gt.s64 %p22, %rd20, 1;
240
+ .loc 1 43 66 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:66
241
+ selp.b64 %rd83, %rd20, 0, %p22;
242
+ .loc 1 43 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43
243
+ selp.b64 %rd84, 1, 0, %p21;
244
+ .loc 1 43 57 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:57
245
+ add.s64 %rd85, %rd83, %rd84;
246
+ .loc 1 43 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:34
247
+ mul.lo.s64 %rd86, %rd91, %rd85;
248
+ .loc 1 43 25 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:25
249
+ add.s64 %rd88, %rd19, %rd90;
250
+ shl.b64 %rd89, %rd86, 2;
251
+ add.s64 %rd73, %rd88, %rd89;
252
+ .loc 1 43 88 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:88
253
+ and.b32 %r47, %r1, 96;
254
+ setp.eq.b32 %p23, %r47, 0;
255
+ and.pred %p19, %p23, %p20;
256
+ // begin inline asm
257
+ @%p19 st.global.b32 [ %rd73 + 0 ], { %r31 };
258
+ // end inline asm
259
+ .loc 1 43 4 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:4
260
+ ret;
261
+ $L__tmp5:
262
+ $L__func_end0:
263
+ // -- End function
264
+ }
265
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py"
266
+ .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
267
+ .section .debug_abbrev
268
+ {
269
+ .b8 1 // Abbreviation Code
270
+ .b8 17 // DW_TAG_compile_unit
271
+ .b8 1 // DW_CHILDREN_yes
272
+ .b8 37 // DW_AT_producer
273
+ .b8 8 // DW_FORM_string
274
+ .b8 19 // DW_AT_language
275
+ .b8 5 // DW_FORM_data2
276
+ .b8 3 // DW_AT_name
277
+ .b8 8 // DW_FORM_string
278
+ .b8 16 // DW_AT_stmt_list
279
+ .b8 6 // DW_FORM_data4
280
+ .b8 27 // DW_AT_comp_dir
281
+ .b8 8 // DW_FORM_string
282
+ .b8 0 // EOM(1)
283
+ .b8 0 // EOM(2)
284
+ .b8 2 // Abbreviation Code
285
+ .b8 46 // DW_TAG_subprogram
286
+ .b8 0 // DW_CHILDREN_no
287
+ .b8 3 // DW_AT_name
288
+ .b8 8 // DW_FORM_string
289
+ .b8 32 // DW_AT_inline
290
+ .b8 11 // DW_FORM_data1
291
+ .b8 0 // EOM(1)
292
+ .b8 0 // EOM(2)
293
+ .b8 3 // Abbreviation Code
294
+ .b8 46 // DW_TAG_subprogram
295
+ .b8 1 // DW_CHILDREN_yes
296
+ .b8 17 // DW_AT_low_pc
297
+ .b8 1 // DW_FORM_addr
298
+ .b8 18 // DW_AT_high_pc
299
+ .b8 1 // DW_FORM_addr
300
+ .b8 49 // DW_AT_abstract_origin
301
+ .b8 19 // DW_FORM_ref4
302
+ .b8 0 // EOM(1)
303
+ .b8 0 // EOM(2)
304
+ .b8 4 // Abbreviation Code
305
+ .b8 29 // DW_TAG_inlined_subroutine
306
+ .b8 0 // DW_CHILDREN_no
307
+ .b8 49 // DW_AT_abstract_origin
308
+ .b8 19 // DW_FORM_ref4
309
+ .b8 17 // DW_AT_low_pc
310
+ .b8 1 // DW_FORM_addr
311
+ .b8 18 // DW_AT_high_pc
312
+ .b8 1 // DW_FORM_addr
313
+ .b8 88 // DW_AT_call_file
314
+ .b8 11 // DW_FORM_data1
315
+ .b8 89 // DW_AT_call_line
316
+ .b8 11 // DW_FORM_data1
317
+ .b8 87 // DW_AT_call_column
318
+ .b8 11 // DW_FORM_data1
319
+ .b8 0 // EOM(1)
320
+ .b8 0 // EOM(2)
321
+ .b8 0 // EOM(3)
322
+ }
323
+ .section .debug_info
324
+ {
325
+ .b32 238 // Length of Unit
326
+ .b8 2 // DWARF version number
327
+ .b8 0
328
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
329
+ .b8 8 // Address Size (in bytes)
330
+ .b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit
331
+ .b8 116 // DW_AT_producer
332
+ .b8 114
333
+ .b8 105
334
+ .b8 116
335
+ .b8 111
336
+ .b8 110
337
+ .b8 0
338
+ .b8 2 // DW_AT_language
339
+ .b8 0
340
+ .b8 99 // DW_AT_name
341
+ .b8 53
342
+ .b8 117
343
+ .b8 99
344
+ .b8 102
345
+ .b8 50
346
+ .b8 105
347
+ .b8 116
348
+ .b8 54
349
+ .b8 102
350
+ .b8 119
351
+ .b8 104
352
+ .b8 109
353
+ .b8 114
354
+ .b8 117
355
+ .b8 51
356
+ .b8 50
357
+ .b8 118
358
+ .b8 112
359
+ .b8 121
360
+ .b8 50
361
+ .b8 120
362
+ .b8 109
363
+ .b8 119
364
+ .b8 107
365
+ .b8 107
366
+ .b8 52
367
+ .b8 118
368
+ .b8 103
369
+ .b8 55
370
+ .b8 112
371
+ .b8 112
372
+ .b8 110
373
+ .b8 111
374
+ .b8 120
375
+ .b8 109
376
+ .b8 104
377
+ .b8 110
378
+ .b8 107
379
+ .b8 102
380
+ .b8 54
381
+ .b8 105
382
+ .b8 102
383
+ .b8 112
384
+ .b8 98
385
+ .b8 50
386
+ .b8 104
387
+ .b8 50
388
+ .b8 116
389
+ .b8 106
390
+ .b8 115
391
+ .b8 108
392
+ .b8 46
393
+ .b8 112
394
+ .b8 121
395
+ .b8 0
396
+ .b32 .debug_line // DW_AT_stmt_list
397
+ .b8 47 // DW_AT_comp_dir
398
+ .b8 119
399
+ .b8 111
400
+ .b8 114
401
+ .b8 107
402
+ .b8 115
403
+ .b8 112
404
+ .b8 97
405
+ .b8 99
406
+ .b8 101
407
+ .b8 47
408
+ .b8 104
409
+ .b8 97
410
+ .b8 110
411
+ .b8 114
412
+ .b8 117
413
+ .b8 105
414
+ .b8 47
415
+ .b8 83
416
+ .b8 112
417
+ .b8 101
418
+ .b8 99
419
+ .b8 70
420
+ .b8 111
421
+ .b8 114
422
+ .b8 103
423
+ .b8 101
424
+ .b8 45
425
+ .b8 101
426
+ .b8 120
427
+ .b8 116
428
+ .b8 47
429
+ .b8 99
430
+ .b8 97
431
+ .b8 99
432
+ .b8 104
433
+ .b8 101
434
+ .b8 47
435
+ .b8 99
436
+ .b8 111
437
+ .b8 109
438
+ .b8 112
439
+ .b8 105
440
+ .b8 108
441
+ .b8 101
442
+ .b8 100
443
+ .b8 95
444
+ .b8 107
445
+ .b8 101
446
+ .b8 114
447
+ .b8 110
448
+ .b8 101
449
+ .b8 108
450
+ .b8 115
451
+ .b8 47
452
+ .b8 53
453
+ .b8 117
454
+ .b8 0
455
+ .b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram
456
+ .b8 116 // DW_AT_name
457
+ .b8 114
458
+ .b8 105
459
+ .b8 116
460
+ .b8 111
461
+ .b8 110
462
+ .b8 95
463
+ .b8 114
464
+ .b8 101
465
+ .b8 100
466
+ .b8 95
467
+ .b8 102
468
+ .b8 117
469
+ .b8 115
470
+ .b8 101
471
+ .b8 100
472
+ .b8 95
473
+ .b8 95
474
+ .b8 116
475
+ .b8 111
476
+ .b8 95
477
+ .b8 99
478
+ .b8 111
479
+ .b8 112
480
+ .b8 121
481
+ .b8 95
482
+ .b8 99
483
+ .b8 108
484
+ .b8 111
485
+ .b8 110
486
+ .b8 101
487
+ .b8 95
488
+ .b8 115
489
+ .b8 108
490
+ .b8 105
491
+ .b8 99
492
+ .b8 101
493
+ .b8 95
494
+ .b8 115
495
+ .b8 117
496
+ .b8 109
497
+ .b8 95
498
+ .b8 116
499
+ .b8 114
500
+ .b8 97
501
+ .b8 110
502
+ .b8 115
503
+ .b8 112
504
+ .b8 111
505
+ .b8 115
506
+ .b8 101
507
+ .b8 95
508
+ .b8 53
509
+ .b8 0
510
+ .b8 1 // DW_AT_inline
511
+ .b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram
512
+ .b64 $L__func_begin0 // DW_AT_low_pc
513
+ .b64 $L__func_end0 // DW_AT_high_pc
514
+ .b32 139 // DW_AT_abstract_origin
515
+ .b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine
516
+ .b32 139 // DW_AT_abstract_origin
517
+ .b64 $L__tmp1 // DW_AT_low_pc
518
+ .b64 $L__tmp4 // DW_AT_high_pc
519
+ .b8 1 // DW_AT_call_file
520
+ .b8 41 // DW_AT_call_line
521
+ .b8 25 // DW_AT_call_column
522
+ .b8 0 // End Of Children Mark
523
+ .b8 0 // End Of Children Mark
524
+ }
525
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
2
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0)
3
+ #loc1 = loc(unknown)
4
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25)
5
+ #loc40 = loc("in_ptr0"(#loc))
6
+ #loc41 = loc("out_ptr1"(#loc))
7
+ #loc42 = loc("ks0"(#loc))
8
+ #loc43 = loc("ks1"(#loc))
9
+ #loc44 = loc("xnumel"(#loc))
10
+ #loc45 = loc("r0_numel"(#loc))
11
+ #loc68 = loc("tmp3"(#loc26))
12
+ #loc73 = loc(callsite(#loc1 at #loc68))
13
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
14
+ tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
15
+ %cst = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1)
16
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
17
+ %c16_i32 = arith.constant 16 : i32 loc(#loc1)
18
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
19
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
20
+ %cst_0 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1)
21
+ %xoffset = tt.get_program_id x : i32 loc(#loc46)
22
+ %xoffset_1 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc47)
23
+ %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48)
24
+ %xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc48)
25
+ %xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<32x1xi32, #blocked> loc(#loc49)
26
+ %xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<32x1xi32, #blocked> loc(#loc49)
27
+ %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32, #blocked> loc(#loc50)
28
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<32x1xi32, #blocked> loc(#loc50)
29
+ %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc51)
30
+ %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc51)
31
+ %x0 = arith.extsi %xindex_4 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc52)
32
+ %x0_7 = tt.splat %ks0 : i64 -> tensor<32x1xi64, #blocked> loc(#loc52)
33
+ %x0_8 = arith.remsi %x0, %x0_7 : tensor<32x1xi64, #blocked> loc(#loc52)
34
+ %x1 = arith.divsi %x0, %x0_7 : tensor<32x1xi64, #blocked> loc(#loc53)
35
+ %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc54)
36
+ %tmp0 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc55)
37
+ %tmp0_9 = tt.broadcast %x0_8 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc56)
38
+ %tmp0_10 = arith.muli %ks0, %ks1 : i64 loc(#loc57)
39
+ %tmp0_11 = tt.splat %tmp0_10 : i64 -> tensor<32x1xi64, #blocked> loc(#loc58)
40
+ %tmp0_12 = arith.muli %tmp0_11, %x1 : tensor<32x1xi64, #blocked> loc(#loc58)
41
+ %tmp0_13 = tt.broadcast %tmp0_12 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc59)
42
+ %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc60)
43
+ %tmp0_15 = tt.broadcast %xmask_5 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc61)
44
+ %_tmp3 = scf.for %_tmp3_17 = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_18 = %cst) -> (tensor<32x16xi64, #blocked>) : i32 {
45
+ %r0_index = tt.splat %_tmp3_17 : i32 -> tensor<1x16xi32, #blocked> loc(#loc63)
46
+ %r0_index_19 = arith.addi %r0_index, %r0_base_6 : tensor<1x16xi32, #blocked> loc(#loc63)
47
+ %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc54)
48
+ %tmp0_21 = arith.extsi %r0_index_19 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc55)
49
+ %tmp0_22 = arith.muli %tmp0, %tmp0_21 : tensor<1x16xi64, #blocked> loc(#loc55)
50
+ %tmp0_23 = tt.broadcast %tmp0_22 : tensor<1x16xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc56)
51
+ %tmp0_24 = arith.addi %tmp0_9, %tmp0_23 : tensor<32x16xi64, #blocked> loc(#loc56)
52
+ %tmp0_25 = arith.addi %tmp0_24, %tmp0_13 : tensor<32x16xi64, #blocked> loc(#loc59)
53
+ %tmp0_26 = tt.addptr %tmp0_14, %tmp0_25 : tensor<32x16x!tt.ptr<i32>, #blocked>, tensor<32x16xi64, #blocked> loc(#loc60)
54
+ %tmp0_27 = tt.broadcast %r0_mask_20 : tensor<1x16xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc61)
55
+ %tmp0_28 = arith.andi %tmp0_27, %tmp0_15 : tensor<32x16xi1, #blocked> loc(#loc61)
56
+ %tmp0_29 = tt.load %tmp0_26, %tmp0_28, %cst_0 evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc64)
57
+ %tmp1 = arith.extsi %tmp0_29 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc65)
58
+ %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<32x16xi64, #blocked> loc(#loc66)
59
+ %_tmp3_30 = arith.select %tmp0_28, %tmp4, %_tmp3_18 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc67)
60
+ scf.yield %_tmp3_30 : tensor<32x16xi64, #blocked> loc(#loc24)
61
+ } loc(#loc62)
62
+ %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({
63
+ ^bb0(%tmp3_17: i64 loc(callsite(#loc1 at #loc68)), %tmp3_18: i64 loc(callsite(#loc1 at #loc68))):
64
+ %tmp3_19 = arith.addi %tmp3_17, %tmp3_18 : i64 loc(#loc74)
65
+ tt.reduce.return %tmp3_19 : i64 loc(#loc72)
66
+ }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72)
67
+ %tmp3_16 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc69)
68
+ %tmp5 = arith.trunci %tmp3_16 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc70)
69
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc30)
70
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc31)
71
+ %2 = arith.extui %1 : i1 to i64 loc(#loc32)
72
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc32)
73
+ %4 = arith.extui %0 : i1 to i64 loc(#loc71)
74
+ %5 = arith.addi %4, %3 : i64 loc(#loc33)
75
+ %6 = tt.splat %5 : i64 -> tensor<32x1xi64, #blocked> loc(#loc35)
76
+ %7 = arith.muli %x1, %6 : tensor<32x1xi64, #blocked> loc(#loc35)
77
+ %8 = arith.addi %x0_8, %7 : tensor<32x1xi64, #blocked> loc(#loc36)
78
+ %9 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc37)
79
+ %10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr<i32>, #blocked>, tensor<32x1xi64, #blocked> loc(#loc37)
80
+ tt.store %10, %tmp5, %xmask_5 : tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc38)
81
+ tt.return loc(#loc39)
82
+ } loc(#loc)
83
+ } loc(#loc)
84
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28)
85
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33)
86
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44)
87
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23)
88
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21)
89
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37)
90
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19)
91
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19)
92
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29)
93
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43)
94
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39)
95
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54)
96
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58)
97
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50)
98
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34)
99
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73)
100
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40)
101
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31)
102
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63)
103
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23)
104
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23)
105
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48)
106
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8)
107
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
108
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
109
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28)
110
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19)
111
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49)
112
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75)
113
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66)
114
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57)
115
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41)
116
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34)
117
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30)
118
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25)
119
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88)
120
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4)
121
+ #loc46 = loc("xoffset"(#loc2))
122
+ #loc47 = loc("xoffset"(#loc3))
123
+ #loc48 = loc("xindex"(#loc4))
124
+ #loc49 = loc("xindex"(#loc5))
125
+ #loc50 = loc("xmask"(#loc6))
126
+ #loc51 = loc("r0_base"(#loc7))
127
+ #loc52 = loc("x0"(#loc8))
128
+ #loc53 = loc("x1"(#loc9))
129
+ #loc54 = loc("r0_mask"(#loc10))
130
+ #loc55 = loc("tmp0"(#loc11))
131
+ #loc56 = loc("tmp0"(#loc12))
132
+ #loc57 = loc("tmp0"(#loc13))
133
+ #loc58 = loc("tmp0"(#loc14))
134
+ #loc59 = loc("tmp0"(#loc15))
135
+ #loc60 = loc("tmp0"(#loc16))
136
+ #loc61 = loc("tmp0"(#loc17))
137
+ #loc62 = loc("_tmp3"(#loc18))
138
+ #loc63 = loc("r0_index"(#loc19))
139
+ #loc64 = loc("tmp0"(#loc20))
140
+ #loc65 = loc("tmp1"(#loc21))
141
+ #loc66 = loc("tmp4"(#loc22))
142
+ #loc67 = loc("_tmp3"(#loc23))
143
+ #loc69 = loc("tmp3"(#loc28))
144
+ #loc70 = loc("tmp5"(#loc29))
145
+ #loc71 = loc(fused[#loc33, #loc34])
146
+ #loc72 = loc(callsite(#loc25 at #loc68))
147
+ #loc74 = loc(callsite(#loc27 at #loc72))
SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25)
4
+ #loc43 = loc("in_ptr0"(#loc))
5
+ #loc44 = loc("out_ptr1"(#loc))
6
+ #loc45 = loc("ks0"(#loc))
7
+ #loc46 = loc("ks1"(#loc))
8
+ #loc47 = loc("xnumel"(#loc))
9
+ #loc48 = loc("r0_numel"(#loc))
10
+ #loc74 = loc("tmp3"(#loc29))
11
+ #loc79 = loc(callsite(#loc1 at #loc74))
12
+ module {
13
+ tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} {
14
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
15
+ %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1)
16
+ %c16_i32 = arith.constant 16 : i32 loc(#loc2)
17
+ %c0_i32 = arith.constant 0 : i32 loc(#loc2)
18
+ %_tmp3 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc49)
19
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
20
+ %xoffset = tt.get_program_id x : i32 loc(#loc50)
21
+ %xoffset_0 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc51)
22
+ %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc52)
23
+ %xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc53)
24
+ %xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<32x1xi32> loc(#loc54)
25
+ %xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<32x1xi32> loc(#loc54)
26
+ %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc55)
27
+ %xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<32x1xi32> loc(#loc55)
28
+ %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc56)
29
+ %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc57)
30
+ %x0 = arith.extsi %xindex_3 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc58)
31
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc58)
32
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc58)
33
+ %x1 = arith.divsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc59)
34
+ %_tmp3_8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_10 = %_tmp3) -> (tensor<32x16xi64>) : i32 {
35
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc61)
36
+ %r0_index_11 = arith.addi %r0_index, %r0_base_5 : tensor<1x16xi32> loc(#loc61)
37
+ %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc62)
38
+ %r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x16xi32> loc(#loc62)
39
+ %tmp0 = arith.extsi %r0_index_11 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc63)
40
+ %tmp0_13 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc63)
41
+ %tmp0_14 = arith.muli %tmp0_13, %tmp0 : tensor<1x16xi64> loc(#loc63)
42
+ %tmp0_15 = tt.broadcast %x0_7 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc64)
43
+ %tmp0_16 = tt.broadcast %tmp0_14 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc64)
44
+ %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<32x16xi64> loc(#loc64)
45
+ %tmp0_18 = arith.muli %ks0, %ks1 : i64 loc(#loc65)
46
+ %tmp0_19 = tt.splat %tmp0_18 : i64 -> tensor<32x1xi64> loc(#loc66)
47
+ %tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<32x1xi64> loc(#loc66)
48
+ %tmp0_21 = tt.broadcast %tmp0_20 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc67)
49
+ %tmp0_22 = arith.addi %tmp0_17, %tmp0_21 : tensor<32x16xi64> loc(#loc67)
50
+ %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc68)
51
+ %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi64> loc(#loc68)
52
+ %tmp0_25 = tt.broadcast %r0_mask_12 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc69)
53
+ %tmp0_26 = tt.broadcast %xmask_4 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc69)
54
+ %tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<32x16xi1> loc(#loc69)
55
+ %tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst evictionPolicy = evict_last : tensor<32x16x!tt.ptr<i32>> loc(#loc70)
56
+ %tmp1 = arith.extsi %tmp0_28 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc71)
57
+ %tmp4 = arith.addi %_tmp3_10, %tmp1 : tensor<32x16xi64> loc(#loc72)
58
+ %_tmp3_29 = arith.select %tmp0_27, %tmp4, %_tmp3_10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc73)
59
+ scf.yield %_tmp3_29 : tensor<32x16xi64> loc(#loc27)
60
+ } loc(#loc60)
61
+ %tmp3 = "tt.reduce"(%_tmp3_8) <{axis = 1 : i32}> ({
62
+ ^bb0(%tmp3_10: i64 loc(callsite(#loc1 at #loc74)), %tmp3_11: i64 loc(callsite(#loc1 at #loc74))):
63
+ %tmp3_12 = arith.addi %tmp3_10, %tmp3_11 : i64 loc(#loc80)
64
+ tt.reduce.return %tmp3_12 : i64 loc(#loc78)
65
+ }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc78)
66
+ %tmp3_9 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc75)
67
+ %tmp5 = arith.trunci %tmp3_9 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc76)
68
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc33)
69
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc34)
70
+ %2 = arith.extui %1 : i1 to i64 loc(#loc35)
71
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc35)
72
+ %4 = arith.extui %0 : i1 to i64 loc(#loc77)
73
+ %5 = arith.addi %4, %3 : i64 loc(#loc36)
74
+ %6 = tt.splat %5 : i64 -> tensor<32x1xi64> loc(#loc38)
75
+ %7 = arith.muli %x1, %6 : tensor<32x1xi64> loc(#loc38)
76
+ %8 = arith.addi %x0_7, %7 : tensor<32x1xi64> loc(#loc39)
77
+ %9 = tt.splat %out_ptr1 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>> loc(#loc40)
78
+ %10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr<i32>>, tensor<32x1xi64> loc(#loc40)
79
+ tt.store %10, %tmp5, %xmask_4 : tensor<32x1x!tt.ptr<i32>> loc(#loc41)
80
+ tt.return loc(#loc42)
81
+ } loc(#loc)
82
+ } loc(#loc)
83
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40)
84
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43)
85
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28)
86
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33)
87
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36)
88
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44)
89
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23)
90
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21)
91
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27)
92
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37)
93
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19)
94
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19)
95
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31)
96
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29)
97
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43)
98
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39)
99
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54)
100
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58)
101
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50)
102
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34)
103
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73)
104
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63)
105
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23)
106
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23)
107
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48)
108
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8)
109
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
110
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
111
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28)
112
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19)
113
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49)
114
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75)
115
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66)
116
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57)
117
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41)
118
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34)
119
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30)
120
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25)
121
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88)
122
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4)
123
+ #loc49 = loc("_tmp3"(#loc3))
124
+ #loc50 = loc("xoffset"(#loc4))
125
+ #loc51 = loc("xoffset"(#loc5))
126
+ #loc52 = loc("xindex"(#loc6))
127
+ #loc53 = loc("xindex"(#loc7))
128
+ #loc54 = loc("xindex"(#loc8))
129
+ #loc55 = loc("xmask"(#loc9))
130
+ #loc56 = loc("r0_base"(#loc10))
131
+ #loc57 = loc("r0_base"(#loc11))
132
+ #loc58 = loc("x0"(#loc12))
133
+ #loc59 = loc("x1"(#loc13))
134
+ #loc60 = loc("_tmp3"(#loc2))
135
+ #loc61 = loc("r0_index"(#loc14))
136
+ #loc62 = loc("r0_mask"(#loc15))
137
+ #loc63 = loc("tmp0"(#loc16))
138
+ #loc64 = loc("tmp0"(#loc17))
139
+ #loc65 = loc("tmp0"(#loc18))
140
+ #loc66 = loc("tmp0"(#loc19))
141
+ #loc67 = loc("tmp0"(#loc20))
142
+ #loc68 = loc("tmp0"(#loc21))
143
+ #loc69 = loc("tmp0"(#loc22))
144
+ #loc70 = loc("tmp0"(#loc23))
145
+ #loc71 = loc("tmp1"(#loc24))
146
+ #loc72 = loc("tmp4"(#loc25))
147
+ #loc73 = loc("_tmp3"(#loc26))
148
+ #loc75 = loc("tmp3"(#loc31))
149
+ #loc76 = loc("tmp5"(#loc32))
150
+ #loc77 = loc(fused[#loc36, #loc37])
151
+ #loc78 = loc(callsite(#loc28 at #loc74))
152
+ #loc80 = loc(callsite(#loc30 at #loc78))
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}}
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin ADDED
Binary file (86.4 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "e50c6a987142dbb9ad0869ff13cb48e2cad8442b31ebedd676f284f006fc5f0a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"}
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source ADDED
The diff for this file is too large to render. See raw diff
 
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir ADDED
@@ -0,0 +1,841 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
3
+ #linear = #ttg.linear<{register = [[0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 1], [0, 2]], block = []}>
4
+ #linear1 = #ttg.linear<{register = [[2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[0, 1, 0], [1, 0, 0]], block = []}>
5
+ #linear2 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[0, 0, 1], [0, 1, 0]], block = []}>
6
+ #linear3 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}>
7
+ #linear4 = #ttg.linear<{register = [[0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}>
8
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0)
9
+ #loc1 = loc(unknown)
10
+ #loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
11
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67)
12
+ #loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
13
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
14
+ #loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
15
+ #loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
16
+ #loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
17
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26)
18
+ #loc77 = loc("in_ptr0"(#loc))
19
+ #loc78 = loc("out_ptr2"(#loc))
20
+ #loc79 = loc("out_ptr3"(#loc))
21
+ #loc80 = loc("xnumel"(#loc))
22
+ #loc81 = loc("r0_numel"(#loc))
23
+ #loc99 = loc(callsite(#loc19 at #loc20))
24
+ #loc105 = loc("ileft"(#loc28))
25
+ #loc109 = loc("iright"(#loc33))
26
+ #loc118 = loc("left_idx"(#loc42))
27
+ #loc123 = loc("right_idx"(#loc47))
28
+ #loc143 = loc("tmp11"(#loc67))
29
+ #loc149 = loc(callsite(#loc24 at #loc99))
30
+ #loc153 = loc(callsite(#loc1 at #loc143))
31
+ #loc157 = loc(callsite(#loc105 at #loc149))
32
+ #loc161 = loc(callsite(#loc109 at #loc149))
33
+ #loc169 = loc(callsite(#loc118 at #loc149))
34
+ #loc174 = loc(callsite(#loc123 at #loc149))
35
+ #loc194 = loc(callsite(#loc1 at #loc157))
36
+ #loc196 = loc(callsite(#loc1 at #loc161))
37
+ #loc199 = loc(callsite(#loc1 at #loc169))
38
+ #loc202 = loc(callsite(#loc1 at #loc174))
39
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
40
+ tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
41
+ %cst = arith.constant dense<0> : tensor<32x16xi32, #linear> loc(#loc1)
42
+ %cst_0 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1)
43
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
44
+ %cst_1 = arith.constant dense<128> : tensor<32x1xi32, #blocked> loc(#loc1)
45
+ %cst_2 = arith.constant dense<128> : tensor<32x1xi32, #blocked1> loc(#loc1)
46
+ %cst_3 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1)
47
+ %cst_4 = arith.constant dense<16> : tensor<32x1xi32, #blocked1> loc(#loc1)
48
+ %cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1)
49
+ %cst_6 = arith.constant dense<272> : tensor<32x1xi32, #blocked> loc(#loc1)
50
+ %cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1)
51
+ %cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1)
52
+ %cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1)
53
+ %cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1)
54
+ %cst_11 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1)
55
+ %xoffset = tt.get_program_id x : i32 loc(#loc82)
56
+ %xoffset_12 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc83)
57
+ %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84)
58
+ %xindex_13 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84)
59
+ %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc84)
60
+ %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc84)
61
+ %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked> loc(#loc85)
62
+ %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc85)
63
+ %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<32x1xi32, #blocked> loc(#loc85)
64
+ %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<32x1xi32, #blocked1> loc(#loc85)
65
+ %xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<32x1xi32, #blocked> loc(#loc86)
66
+ %xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc86)
67
+ %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87)
68
+ %r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87)
69
+ %r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87)
70
+ %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87)
71
+ %r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87)
72
+ %r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87)
73
+ %x0 = arith.remsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc88)
74
+ %x1 = arith.divsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc89)
75
+ %tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90)
76
+ %tmp0_26 = tt.broadcast %x0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91)
77
+ %tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91)
78
+ %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<32x16xi32, #blocked> loc(#loc91)
79
+ %tmp0_29 = arith.muli %x1, %cst_6 : tensor<32x1xi32, #blocked> loc(#loc92)
80
+ %tmp0_30 = tt.broadcast %tmp0_29 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc93)
81
+ %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<32x16xi32, #blocked> loc(#loc93)
82
+ %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc94)
83
+ %tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<32x16x!tt.ptr<i32>, #blocked>, tensor<32x16xi32, #blocked> loc(#loc94)
84
+ %tmp0_34 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc95)
85
+ %tmp0_35 = tt.broadcast %xmask_20 : tensor<32x1xi1, #blocked1> -> tensor<32x16xi1, #blocked1> loc(#loc95)
86
+ %tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<32x16x!tt.ptr<i32>, #blocked> loc(#loc95)
87
+ %tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96)
88
+ %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<32x16xi16, #linear> loc(#loc97)
89
+ %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146)
90
+ %flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146)
91
+ %flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146)
92
+ %flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146)
93
+ %flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146)
94
+ %flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146)
95
+ %flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146)
96
+ %flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146)
97
+ %flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146)
98
+ %flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146)
99
+ %flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146)
100
+ %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146)
101
+ %flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc147)
102
+ %flip_49 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #blocked> loc(#loc148)
103
+ %flip_50 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc148)
104
+ %y = tt.reshape %tmp0_36 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
105
+ %left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155)
106
+ %left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155)
107
+ %left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155)
108
+ %left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155)
109
+ %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc156)
110
+ %ileft_54 = arith.muli %y, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
111
+ %ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({
112
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
113
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
114
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
115
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
116
+ %ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
117
+ %ileft_57 = tt.broadcast %ileft_56 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
118
+ %iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc160)
119
+ %iright_58 = arith.muli %y, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
120
+ %iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({
121
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
122
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
123
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
124
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
125
+ %iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
126
+ %iright_61 = tt.broadcast %iright_60 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
127
+ %ileft_62 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc164)
128
+ %ileft_63 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
129
+ %iright_64 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc165)
130
+ %iright_65 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
131
+ %y_idx = tt.reshape %tmp4 : tensor<32x16xi16, #linear> -> tensor<256x2x1xi16, #linear1> loc(#loc166)
132
+ %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167)
133
+ %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc168)
134
+ %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<256x2x1xi16, #linear1> loc(#loc168)
135
+ %input = arith.extsi %left_idx_67 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc197)
136
+ %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
137
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
138
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
139
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
140
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
141
+ %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
142
+ %left_idx_70 = tt.broadcast %left_idx_69 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
143
+ %right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172)
144
+ %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc173)
145
+ %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<256x2x1xi16, #linear1> loc(#loc173)
146
+ %input_73 = arith.extsi %right_idx_72 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc200)
147
+ %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({
148
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
149
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
150
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
151
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
152
+ %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
153
+ %right_idx_76 = tt.broadcast %right_idx_75 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
154
+ %left_idx_77 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc177)
155
+ %left_idx_78 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
156
+ %right_idx_79 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc178)
157
+ %right_idx_80 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
158
+ %cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc179)
159
+ %cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc179)
160
+ %eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc180)
161
+ %eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc180)
162
+ %cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<32x16xi32, #blocked> loc(#loc181)
163
+ %cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc181)
164
+ %cond_85 = arith.andi %eq, %cond_83 : tensor<32x16xi1, #blocked> loc(#loc182)
165
+ %cond_86 = arith.andi %eq_82, %cond_84 : tensor<32x16xi1, #linear> loc(#loc182)
166
+ %cond_87 = arith.ori %cond, %cond_85 : tensor<32x16xi1, #blocked> loc(#loc183)
167
+ %cond_88 = arith.ori %cond_81, %cond_86 : tensor<32x16xi1, #linear> loc(#loc183)
168
+ %cond_89 = arith.extui %cond_87 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc184)
169
+ %cond_90 = arith.extui %cond_88 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
170
+ %cond_91 = arith.xori %cond_89, %flip_49 : tensor<32x16xi32, #blocked> loc(#loc184)
171
+ %cond_92 = arith.xori %cond_90, %flip_50 : tensor<32x16xi32, #linear> loc(#loc184)
172
+ %cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<32x16xi32, #blocked> loc(#loc185)
173
+ %cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<32x16xi32, #linear> loc(#loc185)
174
+ %ret = arith.xori %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc186)
175
+ %ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc187)
176
+ %ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<32x16xi32, #blocked> loc(#loc188)
177
+ %ret_97 = ttg.convert_layout %ret_96 : tensor<32x16xi32, #blocked> -> tensor<32x16xi32, #linear> loc(#loc188)
178
+ %new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc189)
179
+ %new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
180
+ %new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191)
181
+ %new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<32x16xi32, #linear> loc(#loc191)
182
+ %new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<32x16xi32, #linear> loc(#loc191)
183
+ %flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc147)
184
+ %flip_103 = tt.reshape %flip_102 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc148)
185
+ %y_104 = tt.reshape %ret_96 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #linear2> loc(#loc154)
186
+ %ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc156)
187
+ %ileft_106 = arith.muli %y_104, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156)
188
+ %ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({
189
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
190
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
191
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
192
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
193
+ %ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158)
194
+ %ileft_109 = tt.broadcast %ileft_108 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159)
195
+ %iright_110 = arith.muli %y_104, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160)
196
+ %iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({
197
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
198
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
199
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
200
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
201
+ %iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162)
202
+ %iright_113 = tt.broadcast %iright_112 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163)
203
+ %ileft_114 = tt.reshape %ileft_109 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164)
204
+ %iright_115 = tt.reshape %iright_113 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165)
205
+ %y_idx_116 = tt.reshape %new_idxs_101 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166)
206
+ %left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168)
207
+ %left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({
208
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
209
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
210
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
211
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
212
+ %left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170)
213
+ %left_idx_120 = tt.broadcast %left_idx_119 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171)
214
+ %right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173)
215
+ %right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({
216
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
217
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
218
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
219
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
220
+ %right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175)
221
+ %right_idx_124 = tt.broadcast %right_idx_123 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176)
222
+ %left_idx_125 = tt.reshape %left_idx_120 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177)
223
+ %right_idx_126 = tt.reshape %right_idx_124 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178)
224
+ %cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc179)
225
+ %eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc180)
226
+ %cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc181)
227
+ %cond_130 = arith.andi %eq_128, %cond_129 : tensor<32x16xi1, #linear> loc(#loc182)
228
+ %cond_131 = arith.ori %cond_127, %cond_130 : tensor<32x16xi1, #linear> loc(#loc183)
229
+ %cond_132 = arith.extui %cond_131 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
230
+ %cond_133 = arith.xori %cond_132, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184)
231
+ %cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<32x16xi32, #linear> loc(#loc185)
232
+ %ret_135 = arith.xori %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc186)
233
+ %ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
234
+ %ret_137 = arith.xori %ret_97, %ret_136 : tensor<32x16xi32, #linear> loc(#loc188)
235
+ %new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc189)
236
+ %new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
237
+ %new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<32x16xi32, #linear> loc(#loc191)
238
+ %y_141 = tt.reshape %ret_137 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
239
+ %ileft_142 = arith.muli %y_141, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
240
+ %ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({
241
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
242
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
243
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
244
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
245
+ %ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
246
+ %ileft_145 = tt.broadcast %ileft_144 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
247
+ %iright_146 = arith.muli %y_141, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
248
+ %iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({
249
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
250
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
251
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
252
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
253
+ %iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
254
+ %iright_149 = tt.broadcast %iright_148 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
255
+ %ileft_150 = tt.reshape %ileft_145 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
256
+ %iright_151 = tt.reshape %iright_149 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
257
+ %y_idx_152 = tt.reshape %new_idxs_140 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166)
258
+ %left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168)
259
+ %left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({
260
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
261
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
262
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
263
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
264
+ %left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
265
+ %left_idx_156 = tt.broadcast %left_idx_155 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
266
+ %right_idx_157 = arith.muli %y_idx_152, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173)
267
+ %right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({
268
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
269
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
270
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
271
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
272
+ %right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
273
+ %right_idx_160 = tt.broadcast %right_idx_159 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
274
+ %left_idx_161 = tt.reshape %left_idx_156 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
275
+ %right_idx_162 = tt.reshape %right_idx_160 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
276
+ %cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc179)
277
+ %eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc180)
278
+ %cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc181)
279
+ %cond_166 = arith.andi %eq_164, %cond_165 : tensor<32x16xi1, #linear> loc(#loc182)
280
+ %cond_167 = arith.ori %cond_163, %cond_166 : tensor<32x16xi1, #linear> loc(#loc183)
281
+ %cond_168 = arith.extui %cond_167 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
282
+ %cond_169 = arith.xori %cond_168, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184)
283
+ %cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<32x16xi32, #linear> loc(#loc185)
284
+ %ret_171 = arith.xori %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc186)
285
+ %ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
286
+ %ret_173 = arith.xori %ret_137, %ret_172 : tensor<32x16xi32, #linear> loc(#loc188)
287
+ %new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc189)
288
+ %new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
289
+ %new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<32x16xi32, #linear> loc(#loc191)
290
+ %flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc147)
291
+ %flip_178 = tt.reshape %flip_177 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc148)
292
+ %y_179 = tt.reshape %ret_173 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154)
293
+ %ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc156)
294
+ %ileft_181 = arith.muli %y_179, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156)
295
+ %ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({
296
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
297
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
298
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
299
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
300
+ %ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158)
301
+ %ileft_184 = tt.broadcast %ileft_183 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159)
302
+ %iright_185 = arith.muli %y_179, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160)
303
+ %iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({
304
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
305
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
306
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
307
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
308
+ %iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162)
309
+ %iright_188 = tt.broadcast %iright_187 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163)
310
+ %ileft_189 = tt.reshape %ileft_184 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164)
311
+ %iright_190 = tt.reshape %iright_188 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165)
312
+ %y_idx_191 = tt.reshape %new_idxs_176 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166)
313
+ %left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168)
314
+ %left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({
315
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
316
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
317
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
318
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
319
+ %left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170)
320
+ %left_idx_195 = tt.broadcast %left_idx_194 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171)
321
+ %right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173)
322
+ %right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({
323
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
324
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
325
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
326
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
327
+ %right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175)
328
+ %right_idx_199 = tt.broadcast %right_idx_198 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176)
329
+ %left_idx_200 = tt.reshape %left_idx_195 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177)
330
+ %right_idx_201 = tt.reshape %right_idx_199 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178)
331
+ %cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc179)
332
+ %eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc180)
333
+ %cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc181)
334
+ %cond_205 = arith.andi %eq_203, %cond_204 : tensor<32x16xi1, #linear> loc(#loc182)
335
+ %cond_206 = arith.ori %cond_202, %cond_205 : tensor<32x16xi1, #linear> loc(#loc183)
336
+ %cond_207 = arith.extui %cond_206 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
337
+ %cond_208 = arith.xori %cond_207, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184)
338
+ %cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<32x16xi32, #linear> loc(#loc185)
339
+ %ret_210 = arith.xori %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc186)
340
+ %ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
341
+ %ret_212 = arith.xori %ret_173, %ret_211 : tensor<32x16xi32, #linear> loc(#loc188)
342
+ %new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc189)
343
+ %new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
344
+ %new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<32x16xi32, #linear> loc(#loc191)
345
+ %y_216 = tt.reshape %ret_212 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154)
346
+ %ileft_217 = arith.muli %y_216, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156)
347
+ %ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({
348
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
349
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
350
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
351
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
352
+ %ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158)
353
+ %ileft_220 = tt.broadcast %ileft_219 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159)
354
+ %iright_221 = arith.muli %y_216, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160)
355
+ %iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({
356
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
357
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
358
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
359
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
360
+ %iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162)
361
+ %iright_224 = tt.broadcast %iright_223 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163)
362
+ %ileft_225 = tt.reshape %ileft_220 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164)
363
+ %iright_226 = tt.reshape %iright_224 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165)
364
+ %y_idx_227 = tt.reshape %new_idxs_215 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166)
365
+ %left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168)
366
+ %left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({
367
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
368
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
369
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
370
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
371
+ %left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170)
372
+ %left_idx_231 = tt.broadcast %left_idx_230 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171)
373
+ %right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173)
374
+ %right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({
375
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
376
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
377
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
378
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
379
+ %right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175)
380
+ %right_idx_235 = tt.broadcast %right_idx_234 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176)
381
+ %left_idx_236 = tt.reshape %left_idx_231 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177)
382
+ %right_idx_237 = tt.reshape %right_idx_235 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178)
383
+ %cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc179)
384
+ %eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc180)
385
+ %cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc181)
386
+ %cond_241 = arith.andi %eq_239, %cond_240 : tensor<32x16xi1, #linear> loc(#loc182)
387
+ %cond_242 = arith.ori %cond_238, %cond_241 : tensor<32x16xi1, #linear> loc(#loc183)
388
+ %cond_243 = arith.extui %cond_242 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
389
+ %cond_244 = arith.xori %cond_243, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184)
390
+ %cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<32x16xi32, #linear> loc(#loc185)
391
+ %ret_246 = arith.xori %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc186)
392
+ %ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
393
+ %ret_248 = arith.xori %ret_212, %ret_247 : tensor<32x16xi32, #linear> loc(#loc188)
394
+ %new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc189)
395
+ %new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
396
+ %new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<32x16xi32, #linear> loc(#loc191)
397
+ %y_252 = tt.reshape %ret_248 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
398
+ %ileft_253 = arith.muli %y_252, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
399
+ %ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({
400
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
401
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
402
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
403
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
404
+ %ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
405
+ %ileft_256 = tt.broadcast %ileft_255 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
406
+ %iright_257 = arith.muli %y_252, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
407
+ %iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({
408
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
409
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
410
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
411
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
412
+ %iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
413
+ %iright_260 = tt.broadcast %iright_259 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
414
+ %ileft_261 = tt.reshape %ileft_256 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
415
+ %iright_262 = tt.reshape %iright_260 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
416
+ %y_idx_263 = tt.reshape %new_idxs_251 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166)
417
+ %left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168)
418
+ %left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({
419
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
420
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
421
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
422
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
423
+ %left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
424
+ %left_idx_267 = tt.broadcast %left_idx_266 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
425
+ %right_idx_268 = arith.muli %y_idx_263, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173)
426
+ %right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({
427
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
428
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
429
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
430
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
431
+ %right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
432
+ %right_idx_271 = tt.broadcast %right_idx_270 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
433
+ %left_idx_272 = tt.reshape %left_idx_267 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
434
+ %right_idx_273 = tt.reshape %right_idx_271 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
435
+ %cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc179)
436
+ %eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc180)
437
+ %cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc181)
438
+ %cond_277 = arith.andi %eq_275, %cond_276 : tensor<32x16xi1, #linear> loc(#loc182)
439
+ %cond_278 = arith.ori %cond_274, %cond_277 : tensor<32x16xi1, #linear> loc(#loc183)
440
+ %cond_279 = arith.extui %cond_278 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184)
441
+ %cond_280 = arith.xori %cond_279, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184)
442
+ %cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<32x16xi32, #linear> loc(#loc185)
443
+ %ret_282 = arith.xori %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc186)
444
+ %ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
445
+ %ret_284 = arith.xori %ret_248, %ret_283 : tensor<32x16xi32, #linear> loc(#loc188)
446
+ %new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc189)
447
+ %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
448
+ %new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<32x16xi32, #linear> loc(#loc191)
449
+ %y_288 = tt.reshape %ret_284 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc154)
450
+ %ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc156)
451
+ %ileft_290 = arith.muli %y_288, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc156)
452
+ %ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({
453
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
454
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
455
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
456
+ }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193)
457
+ %ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc158)
458
+ %ileft_293 = tt.broadcast %ileft_292 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc159)
459
+ %iright_294 = arith.muli %y_288, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc160)
460
+ %iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({
461
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
462
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
463
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
464
+ }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195)
465
+ %iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc162)
466
+ %iright_297 = tt.broadcast %iright_296 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc163)
467
+ %ileft_298 = tt.reshape %ileft_293 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc164)
468
+ %iright_299 = tt.reshape %iright_297 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc165)
469
+ %y_idx_300 = tt.reshape %new_idxs_287 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc166)
470
+ %left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc168)
471
+ %left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({
472
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
473
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
474
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
475
+ }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198)
476
+ %left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc170)
477
+ %left_idx_304 = tt.broadcast %left_idx_303 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc171)
478
+ %right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc173)
479
+ %right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({
480
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
481
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
482
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
483
+ }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201)
484
+ %right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc175)
485
+ %right_idx_308 = tt.broadcast %right_idx_307 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc176)
486
+ %left_idx_309 = tt.reshape %left_idx_304 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc177)
487
+ %right_idx_310 = tt.reshape %right_idx_308 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc178)
488
+ %cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc179)
489
+ %eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc180)
490
+ %cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc181)
491
+ %cond_314 = arith.andi %eq_312, %cond_313 : tensor<32x16xi1, #linear> loc(#loc182)
492
+ %cond_315 = arith.ori %cond_311, %cond_314 : tensor<32x16xi1, #linear> loc(#loc183)
493
+ %ret_316 = arith.xori %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc186)
494
+ %ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
495
+ %ret_318 = arith.xori %ret_284, %ret_317 : tensor<32x16xi32, #linear> loc(#loc188)
496
+ %new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc189)
497
+ %new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
498
+ %new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<32x16xi32, #linear> loc(#loc191)
499
+ %y_322 = tt.reshape %ret_318 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154)
500
+ %ileft_323 = arith.muli %y_322, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156)
501
+ %ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({
502
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
503
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
504
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
505
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193)
506
+ %ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158)
507
+ %ileft_326 = tt.broadcast %ileft_325 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159)
508
+ %iright_327 = arith.muli %y_322, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160)
509
+ %iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({
510
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
511
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
512
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
513
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195)
514
+ %iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162)
515
+ %iright_330 = tt.broadcast %iright_329 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163)
516
+ %ileft_331 = tt.reshape %ileft_326 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164)
517
+ %iright_332 = tt.reshape %iright_330 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165)
518
+ %y_idx_333 = tt.reshape %new_idxs_321 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166)
519
+ %left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168)
520
+ %left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({
521
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
522
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
523
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
524
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198)
525
+ %left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170)
526
+ %left_idx_337 = tt.broadcast %left_idx_336 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171)
527
+ %right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173)
528
+ %right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({
529
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
530
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
531
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
532
+ }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201)
533
+ %right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175)
534
+ %right_idx_341 = tt.broadcast %right_idx_340 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176)
535
+ %left_idx_342 = tt.reshape %left_idx_337 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177)
536
+ %right_idx_343 = tt.reshape %right_idx_341 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178)
537
+ %cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc179)
538
+ %eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc180)
539
+ %cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc181)
540
+ %cond_347 = arith.andi %eq_345, %cond_346 : tensor<32x16xi1, #linear> loc(#loc182)
541
+ %cond_348 = arith.ori %cond_344, %cond_347 : tensor<32x16xi1, #linear> loc(#loc183)
542
+ %ret_349 = arith.xori %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc186)
543
+ %ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
544
+ %ret_351 = arith.xori %ret_318, %ret_350 : tensor<32x16xi32, #linear> loc(#loc188)
545
+ %new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc189)
546
+ %new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
547
+ %new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<32x16xi32, #linear> loc(#loc191)
548
+ %y_355 = tt.reshape %ret_351 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154)
549
+ %ileft_356 = arith.muli %y_355, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156)
550
+ %ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({
551
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
552
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
553
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
554
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193)
555
+ %ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158)
556
+ %ileft_359 = tt.broadcast %ileft_358 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159)
557
+ %iright_360 = arith.muli %y_355, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160)
558
+ %iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({
559
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
560
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
561
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
562
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195)
563
+ %iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162)
564
+ %iright_363 = tt.broadcast %iright_362 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163)
565
+ %ileft_364 = tt.reshape %ileft_359 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164)
566
+ %iright_365 = tt.reshape %iright_363 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165)
567
+ %y_idx_366 = tt.reshape %new_idxs_354 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166)
568
+ %left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168)
569
+ %left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({
570
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
571
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
572
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
573
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198)
574
+ %left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170)
575
+ %left_idx_370 = tt.broadcast %left_idx_369 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171)
576
+ %right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173)
577
+ %right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({
578
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
579
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
580
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
581
+ }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201)
582
+ %right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175)
583
+ %right_idx_374 = tt.broadcast %right_idx_373 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176)
584
+ %left_idx_375 = tt.reshape %left_idx_370 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177)
585
+ %right_idx_376 = tt.reshape %right_idx_374 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178)
586
+ %cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc179)
587
+ %eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc180)
588
+ %cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc181)
589
+ %cond_380 = arith.andi %eq_378, %cond_379 : tensor<32x16xi1, #linear> loc(#loc182)
590
+ %cond_381 = arith.ori %cond_377, %cond_380 : tensor<32x16xi1, #linear> loc(#loc183)
591
+ %ret_382 = arith.xori %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc186)
592
+ %ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187)
593
+ %ret_384 = arith.xori %ret_351, %ret_383 : tensor<32x16xi32, #linear> loc(#loc188)
594
+ %new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc189)
595
+ %new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
596
+ %new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<32x16xi32, #linear> loc(#loc191)
597
+ %y_388 = tt.reshape %ret_384 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154)
598
+ %ileft_389 = arith.muli %y_388, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156)
599
+ %ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({
600
+ ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))):
601
+ %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203)
602
+ tt.reduce.return %ileft_421 : i32 loc(#loc193)
603
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193)
604
+ %ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158)
605
+ %ileft_392 = tt.broadcast %ileft_391 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159)
606
+ %iright_393 = arith.muli %y_388, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160)
607
+ %iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({
608
+ ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))):
609
+ %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204)
610
+ tt.reduce.return %iright_421 : i32 loc(#loc195)
611
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195)
612
+ %iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162)
613
+ %iright_396 = tt.broadcast %iright_395 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163)
614
+ %ileft_397 = tt.reshape %ileft_392 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164)
615
+ %iright_398 = tt.reshape %iright_396 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165)
616
+ %y_idx_399 = tt.reshape %new_idxs_387 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166)
617
+ %left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168)
618
+ %left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({
619
+ ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))):
620
+ %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205)
621
+ tt.reduce.return %left_idx_421 : i32 loc(#loc198)
622
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198)
623
+ %left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170)
624
+ %left_idx_403 = tt.broadcast %left_idx_402 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171)
625
+ %right_idx_404 = arith.muli %y_idx_399, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173)
626
+ %right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({
627
+ ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))):
628
+ %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206)
629
+ tt.reduce.return %right_idx_421 : i32 loc(#loc201)
630
+ }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201)
631
+ %right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175)
632
+ %right_idx_407 = tt.broadcast %right_idx_406 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176)
633
+ %left_idx_408 = tt.reshape %left_idx_403 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177)
634
+ %right_idx_409 = tt.reshape %right_idx_407 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178)
635
+ %cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc179)
636
+ %eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc180)
637
+ %cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc181)
638
+ %cond_413 = arith.andi %eq_411, %cond_412 : tensor<32x16xi1, #linear> loc(#loc182)
639
+ %cond_414 = arith.ori %cond_410, %cond_413 : tensor<32x16xi1, #linear> loc(#loc183)
640
+ %new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc189)
641
+ %new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190)
642
+ %new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<32x16xi32, #linear> loc(#loc191)
643
+ %tmp7 = arith.extsi %tmp0_36 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc141)
644
+ %tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc142)
645
+ %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({
646
+ ^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))):
647
+ %tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192)
648
+ tt.reduce.return %tmp11_421 : i64 loc(#loc152)
649
+ }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152)
650
+ %tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc144)
651
+ %tmp14 = arith.trunci %tmp11_418 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc145)
652
+ %0 = arith.muli %xindex_19, %cst_4 : tensor<32x1xi32, #blocked1> loc(#loc70)
653
+ %1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71)
654
+ %2 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71)
655
+ %3 = arith.addi %1, %2 : tensor<32x16xi32, #blocked1> loc(#loc71)
656
+ %4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>, #blocked1> loc(#loc72)
657
+ %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr<i32>, #blocked1>, tensor<32x16xi32, #blocked1> loc(#loc72)
658
+ %6 = ttg.convert_layout %new_idxs_417 : tensor<32x16xi32, #linear> -> tensor<32x16xi32, #blocked1> loc(#loc73)
659
+ tt.store %5, %6, %tmp0_35 : tensor<32x16x!tt.ptr<i32>, #blocked1> loc(#loc73)
660
+ %7 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc74)
661
+ %8 = tt.addptr %7, %xindex_18 : tensor<32x1x!tt.ptr<i32>, #blocked>, tensor<32x1xi32, #blocked> loc(#loc74)
662
+ tt.store %8, %tmp14, %xmask : tensor<32x1x!tt.ptr<i32>, #blocked> loc(#loc75)
663
+ tt.return loc(#loc76)
664
+ } loc(#loc)
665
+ } loc(#loc)
666
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28)
667
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:33)
668
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:44)
669
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:23)
670
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21)
671
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38)
672
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19)
673
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19)
674
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38)
675
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35)
676
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49)
677
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45)
678
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30)
679
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54)
680
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19)
681
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":40:33)
682
+ #loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
683
+ #loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
684
+ #loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
685
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
686
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
687
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
688
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
689
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
690
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
691
+ #loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
692
+ #loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
693
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
694
+ #loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
695
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
696
+ #loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
697
+ #loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
698
+ #loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
699
+ #loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
700
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
701
+ #loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
702
+ #loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
703
+ #loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
704
+ #loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
705
+ #loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
706
+ #loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
707
+ #loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
708
+ #loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
709
+ #loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
710
+ #loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
711
+ #loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
712
+ #loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
713
+ #loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
714
+ #loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
715
+ #loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
716
+ #loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
717
+ #loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
718
+ #loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
719
+ #loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
720
+ #loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
721
+ #loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
722
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19)
723
+ #loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34)
724
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29)
725
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21)
726
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35)
727
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32)
728
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25)
729
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47)
730
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25)
731
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37)
732
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4)
733
+ #loc82 = loc("xoffset"(#loc2))
734
+ #loc83 = loc("xoffset"(#loc3))
735
+ #loc84 = loc("xindex"(#loc4))
736
+ #loc85 = loc("xindex"(#loc5))
737
+ #loc86 = loc("xmask"(#loc6))
738
+ #loc87 = loc("r0_index"(#loc7))
739
+ #loc88 = loc("x0"(#loc8))
740
+ #loc89 = loc("x1"(#loc9))
741
+ #loc90 = loc("tmp0"(#loc10))
742
+ #loc91 = loc("tmp0"(#loc11))
743
+ #loc92 = loc("tmp0"(#loc12))
744
+ #loc93 = loc("tmp0"(#loc13))
745
+ #loc94 = loc("tmp0"(#loc14))
746
+ #loc95 = loc("tmp0"(#loc15))
747
+ #loc96 = loc("tmp2"(#loc16))
748
+ #loc97 = loc("tmp4"(#loc17))
749
+ #loc98 = loc("flip"(#loc18))
750
+ #loc100 = loc("flip"(#loc21))
751
+ #loc101 = loc("flip"(#loc22))
752
+ #loc102 = loc("y"(#loc23))
753
+ #loc103 = loc("left_mask"(#loc25))
754
+ #loc104 = loc("ileft"(#loc26))
755
+ #loc106 = loc("ileft"(#loc30))
756
+ #loc107 = loc("ileft"(#loc31))
757
+ #loc108 = loc("iright"(#loc32))
758
+ #loc110 = loc("iright"(#loc34))
759
+ #loc111 = loc("iright"(#loc35))
760
+ #loc112 = loc("ileft"(#loc36))
761
+ #loc113 = loc("iright"(#loc37))
762
+ #loc114 = loc("y_idx"(#loc38))
763
+ #loc115 = loc("left_idx"(#loc39))
764
+ #loc116 = loc("left_idx"(#loc40))
765
+ #loc117 = loc("input"(#loc41))
766
+ #loc119 = loc("left_idx"(#loc43))
767
+ #loc120 = loc("left_idx"(#loc44))
768
+ #loc121 = loc("right_idx"(#loc45))
769
+ #loc122 = loc("right_idx"(#loc46))
770
+ #loc124 = loc("right_idx"(#loc48))
771
+ #loc125 = loc("right_idx"(#loc49))
772
+ #loc126 = loc("left_idx"(#loc50))
773
+ #loc127 = loc("right_idx"(#loc51))
774
+ #loc128 = loc("cond"(#loc52))
775
+ #loc129 = loc("eq"(#loc53))
776
+ #loc130 = loc("cond"(#loc54))
777
+ #loc131 = loc("cond"(#loc55))
778
+ #loc132 = loc("cond"(#loc56))
779
+ #loc133 = loc("cond"(#loc57))
780
+ #loc134 = loc("cond"(#loc58))
781
+ #loc135 = loc("ret"(#loc59))
782
+ #loc136 = loc("ret"(#loc60))
783
+ #loc137 = loc("ret"(#loc61))
784
+ #loc138 = loc("new_idxs"(#loc62))
785
+ #loc139 = loc("new_idxs"(#loc63))
786
+ #loc140 = loc("new_idxs"(#loc64))
787
+ #loc141 = loc("tmp7"(#loc65))
788
+ #loc142 = loc("tmp10"(#loc66))
789
+ #loc144 = loc("tmp11"(#loc68))
790
+ #loc145 = loc("tmp14"(#loc69))
791
+ #loc146 = loc(callsite(#loc98 at #loc99))
792
+ #loc147 = loc(callsite(#loc100 at #loc99))
793
+ #loc148 = loc(callsite(#loc101 at #loc99))
794
+ #loc150 = loc("cond"(#loc128))
795
+ #loc151 = loc("eq"(#loc129))
796
+ #loc152 = loc(callsite(#loc27 at #loc143))
797
+ #loc154 = loc(callsite(#loc102 at #loc149))
798
+ #loc155 = loc(callsite(#loc103 at #loc149))
799
+ #loc156 = loc(callsite(#loc104 at #loc149))
800
+ #loc158 = loc(callsite(#loc106 at #loc149))
801
+ #loc159 = loc(callsite(#loc107 at #loc149))
802
+ #loc160 = loc(callsite(#loc108 at #loc149))
803
+ #loc162 = loc(callsite(#loc110 at #loc149))
804
+ #loc163 = loc(callsite(#loc111 at #loc149))
805
+ #loc164 = loc(callsite(#loc112 at #loc149))
806
+ #loc165 = loc(callsite(#loc113 at #loc149))
807
+ #loc166 = loc(callsite(#loc114 at #loc149))
808
+ #loc167 = loc(callsite(#loc115 at #loc149))
809
+ #loc168 = loc(callsite(#loc116 at #loc149))
810
+ #loc170 = loc(callsite(#loc119 at #loc149))
811
+ #loc171 = loc(callsite(#loc120 at #loc149))
812
+ #loc172 = loc(callsite(#loc121 at #loc149))
813
+ #loc173 = loc(callsite(#loc122 at #loc149))
814
+ #loc175 = loc(callsite(#loc124 at #loc149))
815
+ #loc176 = loc(callsite(#loc125 at #loc149))
816
+ #loc177 = loc(callsite(#loc126 at #loc149))
817
+ #loc178 = loc(callsite(#loc127 at #loc149))
818
+ #loc179 = loc(callsite(#loc150 at #loc149))
819
+ #loc180 = loc(callsite(#loc151 at #loc149))
820
+ #loc181 = loc(callsite(#loc130 at #loc149))
821
+ #loc182 = loc(callsite(#loc131 at #loc149))
822
+ #loc183 = loc(callsite(#loc132 at #loc149))
823
+ #loc184 = loc(callsite(#loc133 at #loc149))
824
+ #loc185 = loc(callsite(#loc134 at #loc149))
825
+ #loc186 = loc(callsite(#loc135 at #loc149))
826
+ #loc187 = loc(callsite(#loc136 at #loc149))
827
+ #loc188 = loc(callsite(#loc137 at #loc149))
828
+ #loc189 = loc(callsite(#loc138 at #loc149))
829
+ #loc190 = loc(callsite(#loc139 at #loc149))
830
+ #loc191 = loc(callsite(#loc140 at #loc149))
831
+ #loc192 = loc(callsite(#loc29 at #loc152))
832
+ #loc193 = loc(callsite(#loc27 at #loc157))
833
+ #loc195 = loc(callsite(#loc27 at #loc161))
834
+ #loc197 = loc(callsite(#loc117 at #loc169))
835
+ #loc198 = loc(callsite(#loc27 at #loc169))
836
+ #loc200 = loc(callsite(#loc117 at #loc174))
837
+ #loc201 = loc(callsite(#loc27 at #loc174))
838
+ #loc203 = loc(callsite(#loc29 at #loc193))
839
+ #loc204 = loc(callsite(#loc29 at #loc195))
840
+ #loc205 = loc(callsite(#loc29 at #loc198))
841
+ #loc206 = loc(callsite(#loc29 at #loc201))
SpecForge-ext/cache/compiled_kernels/triton/6/4UGGVGDRILN3TLIINH7RHS2I4LFNQRBLGHV63VTW6KCPABX4L4FA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67)
4
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12)
5
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73)
6
+ #loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51)
7
+ #loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53)
8
+ #loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50)
9
+ #loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51)
10
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26)
11
+ #loc80 = loc("in_ptr0"(#loc))
12
+ #loc81 = loc("out_ptr2"(#loc))
13
+ #loc82 = loc("out_ptr3"(#loc))
14
+ #loc83 = loc("xnumel"(#loc))
15
+ #loc84 = loc("r0_numel"(#loc))
16
+ #loc106 = loc(callsite(#loc23 at #loc2))
17
+ #loc113 = loc("ileft"(#loc32))
18
+ #loc117 = loc("iright"(#loc37))
19
+ #loc126 = loc("left_idx"(#loc46))
20
+ #loc131 = loc("right_idx"(#loc51))
21
+ #loc150 = loc("tmp11"(#loc70))
22
+ #loc157 = loc(callsite(#loc28 at #loc106))
23
+ #loc161 = loc(callsite(#loc1 at #loc150))
24
+ #loc165 = loc(callsite(#loc113 at #loc157))
25
+ #loc169 = loc(callsite(#loc117 at #loc157))
26
+ #loc177 = loc(callsite(#loc126 at #loc157))
27
+ #loc182 = loc(callsite(#loc131 at #loc157))
28
+ #loc202 = loc(callsite(#loc1 at #loc165))
29
+ #loc204 = loc(callsite(#loc1 at #loc169))
30
+ #loc207 = loc(callsite(#loc1 at #loc177))
31
+ #loc210 = loc(callsite(#loc1 at #loc182))
32
+ module {
33
+ tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
34
+ %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85)
35
+ %cst_0 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1)
36
+ %tmp10 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc86)
37
+ %tmp0 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc87)
38
+ %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88)
39
+ %cst_2 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1)
40
+ %xmask = arith.constant dense<128> : tensor<32x1xi32> loc(#loc89)
41
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
42
+ %xoffset = tt.get_program_id x : i32 loc(#loc90)
43
+ %xoffset_3 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc91)
44
+ %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc92)
45
+ %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc93)
46
+ %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<32x1xi32> loc(#loc94)
47
+ %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<32x1xi32> loc(#loc94)
48
+ %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<32x1xi32> loc(#loc89)
49
+ %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95)
50
+ %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96)
51
+ %x0 = arith.remsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc97)
52
+ %x1 = arith.divsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc98)
53
+ %tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88)
54
+ %tmp0_10 = tt.broadcast %x0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc99)
55
+ %tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc99)
56
+ %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<32x16xi32> loc(#loc99)
57
+ %tmp0_13 = arith.muli %x1, %tmp0 : tensor<32x1xi32> loc(#loc87)
58
+ %tmp0_14 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc100)
59
+ %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<32x16xi32> loc(#loc100)
60
+ %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc101)
61
+ %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi32> loc(#loc101)
62
+ %tmp0_18 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc102)
63
+ %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<32x16x!tt.ptr<i32>> loc(#loc102)
64
+ %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103)
65
+ %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc104)
66
+ %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153)
67
+ %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154)
68
+ %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154)
69
+ %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc155)
70
+ %flip_23 = tt.reshape %flip_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc156)
71
+ %y = tt.reshape %tmp0_19 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
72
+ %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163)
73
+ %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc164)
74
+ %ileft_24 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc164)
75
+ %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({
76
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
77
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
78
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
79
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
80
+ %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
81
+ %ileft_27 = tt.broadcast %ileft_26 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
82
+ %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc168)
83
+ %iright_28 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc168)
84
+ %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({
85
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
86
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
87
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
88
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
89
+ %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
90
+ %iright_31 = tt.broadcast %iright_30 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
91
+ %ileft_32 = tt.reshape %ileft_27 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
92
+ %iright_33 = tt.reshape %iright_31 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
93
+ %y_idx = tt.reshape %tmp4 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc174)
94
+ %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175)
95
+ %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc176)
96
+ %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<256x2x1xi16> loc(#loc176)
97
+ %input = arith.extsi %left_idx_35 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc205)
98
+ %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
99
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
100
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
101
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
102
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
103
+ %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
104
+ %left_idx_38 = tt.broadcast %left_idx_37 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
105
+ %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180)
106
+ %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc181)
107
+ %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<256x2x1xi16> loc(#loc181)
108
+ %input_41 = arith.extsi %right_idx_40 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc208)
109
+ %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({
110
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
111
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
112
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
113
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
114
+ %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
115
+ %right_idx_44 = tt.broadcast %right_idx_43 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
116
+ %left_idx_45 = tt.reshape %left_idx_38 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
117
+ %right_idx_46 = tt.reshape %right_idx_44 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
118
+ %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc187)
119
+ %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc188)
120
+ %cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc189)
121
+ %cond_48 = arith.andi %eq, %cond_47 : tensor<32x16xi1> loc(#loc190)
122
+ %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc191)
123
+ %cond_50 = arith.extui %cond_49 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
124
+ %cond_51 = arith.xori %cond_50, %flip_23 : tensor<32x16xi32> loc(#loc192)
125
+ %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<32x16xi32> loc(#loc193)
126
+ %ret = arith.xori %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc194)
127
+ %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
128
+ %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<32x16xi32> loc(#loc196)
129
+ %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc197)
130
+ %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
131
+ %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199)
132
+ %new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc199)
133
+ %new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<32x16xi32> loc(#loc199)
134
+ %flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc155)
135
+ %flip_60 = tt.reshape %flip_59 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc156)
136
+ %y_61 = tt.reshape %ret_54 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162)
137
+ %ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc164)
138
+ %ileft_63 = arith.muli %y_61, %ileft_62 : tensor<128x2x2xi32> loc(#loc164)
139
+ %ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({
140
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
141
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
142
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
143
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201)
144
+ %ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166)
145
+ %ileft_66 = tt.broadcast %ileft_65 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167)
146
+ %iright_67 = arith.muli %y_61, %flip_22 : tensor<128x2x2xi32> loc(#loc168)
147
+ %iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({
148
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
149
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
150
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
151
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203)
152
+ %iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170)
153
+ %iright_70 = tt.broadcast %iright_69 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171)
154
+ %ileft_71 = tt.reshape %ileft_66 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172)
155
+ %iright_72 = tt.reshape %iright_70 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173)
156
+ %y_idx_73 = tt.reshape %new_idxs_58 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174)
157
+ %left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<128x2x2xi32> loc(#loc176)
158
+ %left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({
159
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
160
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
161
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
162
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206)
163
+ %left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178)
164
+ %left_idx_77 = tt.broadcast %left_idx_76 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179)
165
+ %right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<128x2x2xi32> loc(#loc181)
166
+ %right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({
167
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
168
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
169
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
170
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209)
171
+ %right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183)
172
+ %right_idx_81 = tt.broadcast %right_idx_80 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184)
173
+ %left_idx_82 = tt.reshape %left_idx_77 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185)
174
+ %right_idx_83 = tt.reshape %right_idx_81 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186)
175
+ %cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc187)
176
+ %eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc188)
177
+ %cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc189)
178
+ %cond_87 = arith.andi %eq_85, %cond_86 : tensor<32x16xi1> loc(#loc190)
179
+ %cond_88 = arith.ori %cond_84, %cond_87 : tensor<32x16xi1> loc(#loc191)
180
+ %cond_89 = arith.extui %cond_88 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
181
+ %cond_90 = arith.xori %cond_89, %flip_60 : tensor<32x16xi32> loc(#loc192)
182
+ %cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<32x16xi32> loc(#loc193)
183
+ %ret_92 = arith.xori %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc194)
184
+ %ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
185
+ %ret_94 = arith.xori %ret_54, %ret_93 : tensor<32x16xi32> loc(#loc196)
186
+ %new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc197)
187
+ %new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
188
+ %new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<32x16xi32> loc(#loc199)
189
+ %y_98 = tt.reshape %ret_94 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
190
+ %ileft_99 = arith.muli %y_98, %ileft : tensor<256x2x1xi32> loc(#loc164)
191
+ %ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({
192
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
193
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
194
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
195
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
196
+ %ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
197
+ %ileft_102 = tt.broadcast %ileft_101 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
198
+ %iright_103 = arith.muli %y_98, %iright : tensor<256x2x1xi32> loc(#loc168)
199
+ %iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({
200
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
201
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
202
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
203
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
204
+ %iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
205
+ %iright_106 = tt.broadcast %iright_105 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
206
+ %ileft_107 = tt.reshape %ileft_102 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
207
+ %iright_108 = tt.reshape %iright_106 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
208
+ %y_idx_109 = tt.reshape %new_idxs_97 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174)
209
+ %left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<256x2x1xi32> loc(#loc176)
210
+ %left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({
211
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
212
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
213
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
214
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
215
+ %left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
216
+ %left_idx_113 = tt.broadcast %left_idx_112 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
217
+ %right_idx_114 = arith.muli %y_idx_109, %iright : tensor<256x2x1xi32> loc(#loc181)
218
+ %right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({
219
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
220
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
221
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
222
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
223
+ %right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
224
+ %right_idx_117 = tt.broadcast %right_idx_116 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
225
+ %left_idx_118 = tt.reshape %left_idx_113 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
226
+ %right_idx_119 = tt.reshape %right_idx_117 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
227
+ %cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc187)
228
+ %eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc188)
229
+ %cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc189)
230
+ %cond_123 = arith.andi %eq_121, %cond_122 : tensor<32x16xi1> loc(#loc190)
231
+ %cond_124 = arith.ori %cond_120, %cond_123 : tensor<32x16xi1> loc(#loc191)
232
+ %cond_125 = arith.extui %cond_124 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
233
+ %cond_126 = arith.xori %cond_125, %flip_60 : tensor<32x16xi32> loc(#loc192)
234
+ %cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<32x16xi32> loc(#loc193)
235
+ %ret_128 = arith.xori %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc194)
236
+ %ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
237
+ %ret_130 = arith.xori %ret_94, %ret_129 : tensor<32x16xi32> loc(#loc196)
238
+ %new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc197)
239
+ %new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
240
+ %new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<32x16xi32> loc(#loc199)
241
+ %flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc155)
242
+ %flip_135 = tt.reshape %flip_134 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc156)
243
+ %y_136 = tt.reshape %ret_130 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162)
244
+ %ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc164)
245
+ %ileft_138 = arith.muli %y_136, %ileft_137 : tensor<64x2x4xi32> loc(#loc164)
246
+ %ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({
247
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
248
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
249
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
250
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201)
251
+ %ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166)
252
+ %ileft_141 = tt.broadcast %ileft_140 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167)
253
+ %iright_142 = arith.muli %y_136, %flip_59 : tensor<64x2x4xi32> loc(#loc168)
254
+ %iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({
255
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
256
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
257
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
258
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203)
259
+ %iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170)
260
+ %iright_145 = tt.broadcast %iright_144 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171)
261
+ %ileft_146 = tt.reshape %ileft_141 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172)
262
+ %iright_147 = tt.reshape %iright_145 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173)
263
+ %y_idx_148 = tt.reshape %new_idxs_133 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174)
264
+ %left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<64x2x4xi32> loc(#loc176)
265
+ %left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({
266
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
267
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
268
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
269
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206)
270
+ %left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178)
271
+ %left_idx_152 = tt.broadcast %left_idx_151 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179)
272
+ %right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<64x2x4xi32> loc(#loc181)
273
+ %right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({
274
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
275
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
276
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
277
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209)
278
+ %right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183)
279
+ %right_idx_156 = tt.broadcast %right_idx_155 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184)
280
+ %left_idx_157 = tt.reshape %left_idx_152 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185)
281
+ %right_idx_158 = tt.reshape %right_idx_156 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186)
282
+ %cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc187)
283
+ %eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc188)
284
+ %cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc189)
285
+ %cond_162 = arith.andi %eq_160, %cond_161 : tensor<32x16xi1> loc(#loc190)
286
+ %cond_163 = arith.ori %cond_159, %cond_162 : tensor<32x16xi1> loc(#loc191)
287
+ %cond_164 = arith.extui %cond_163 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
288
+ %cond_165 = arith.xori %cond_164, %flip_135 : tensor<32x16xi32> loc(#loc192)
289
+ %cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<32x16xi32> loc(#loc193)
290
+ %ret_167 = arith.xori %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc194)
291
+ %ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
292
+ %ret_169 = arith.xori %ret_130, %ret_168 : tensor<32x16xi32> loc(#loc196)
293
+ %new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc197)
294
+ %new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
295
+ %new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<32x16xi32> loc(#loc199)
296
+ %y_173 = tt.reshape %ret_169 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162)
297
+ %ileft_174 = arith.muli %y_173, %ileft_62 : tensor<128x2x2xi32> loc(#loc164)
298
+ %ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({
299
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
300
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
301
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
302
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201)
303
+ %ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166)
304
+ %ileft_177 = tt.broadcast %ileft_176 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167)
305
+ %iright_178 = arith.muli %y_173, %flip_22 : tensor<128x2x2xi32> loc(#loc168)
306
+ %iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({
307
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
308
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
309
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
310
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203)
311
+ %iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170)
312
+ %iright_181 = tt.broadcast %iright_180 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171)
313
+ %ileft_182 = tt.reshape %ileft_177 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172)
314
+ %iright_183 = tt.reshape %iright_181 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173)
315
+ %y_idx_184 = tt.reshape %new_idxs_172 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174)
316
+ %left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<128x2x2xi32> loc(#loc176)
317
+ %left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({
318
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
319
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
320
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
321
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206)
322
+ %left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178)
323
+ %left_idx_188 = tt.broadcast %left_idx_187 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179)
324
+ %right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<128x2x2xi32> loc(#loc181)
325
+ %right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({
326
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
327
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
328
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
329
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209)
330
+ %right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183)
331
+ %right_idx_192 = tt.broadcast %right_idx_191 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184)
332
+ %left_idx_193 = tt.reshape %left_idx_188 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185)
333
+ %right_idx_194 = tt.reshape %right_idx_192 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186)
334
+ %cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc187)
335
+ %eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc188)
336
+ %cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc189)
337
+ %cond_198 = arith.andi %eq_196, %cond_197 : tensor<32x16xi1> loc(#loc190)
338
+ %cond_199 = arith.ori %cond_195, %cond_198 : tensor<32x16xi1> loc(#loc191)
339
+ %cond_200 = arith.extui %cond_199 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
340
+ %cond_201 = arith.xori %cond_200, %flip_135 : tensor<32x16xi32> loc(#loc192)
341
+ %cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<32x16xi32> loc(#loc193)
342
+ %ret_203 = arith.xori %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc194)
343
+ %ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
344
+ %ret_205 = arith.xori %ret_169, %ret_204 : tensor<32x16xi32> loc(#loc196)
345
+ %new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc197)
346
+ %new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
347
+ %new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<32x16xi32> loc(#loc199)
348
+ %y_209 = tt.reshape %ret_205 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
349
+ %ileft_210 = arith.muli %y_209, %ileft : tensor<256x2x1xi32> loc(#loc164)
350
+ %ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({
351
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
352
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
353
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
354
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
355
+ %ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
356
+ %ileft_213 = tt.broadcast %ileft_212 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
357
+ %iright_214 = arith.muli %y_209, %iright : tensor<256x2x1xi32> loc(#loc168)
358
+ %iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({
359
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
360
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
361
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
362
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
363
+ %iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
364
+ %iright_217 = tt.broadcast %iright_216 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
365
+ %ileft_218 = tt.reshape %ileft_213 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
366
+ %iright_219 = tt.reshape %iright_217 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
367
+ %y_idx_220 = tt.reshape %new_idxs_208 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174)
368
+ %left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<256x2x1xi32> loc(#loc176)
369
+ %left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({
370
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
371
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
372
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
373
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
374
+ %left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
375
+ %left_idx_224 = tt.broadcast %left_idx_223 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
376
+ %right_idx_225 = arith.muli %y_idx_220, %iright : tensor<256x2x1xi32> loc(#loc181)
377
+ %right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({
378
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
379
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
380
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
381
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
382
+ %right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
383
+ %right_idx_228 = tt.broadcast %right_idx_227 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
384
+ %left_idx_229 = tt.reshape %left_idx_224 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
385
+ %right_idx_230 = tt.reshape %right_idx_228 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
386
+ %cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc187)
387
+ %eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc188)
388
+ %cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc189)
389
+ %cond_234 = arith.andi %eq_232, %cond_233 : tensor<32x16xi1> loc(#loc190)
390
+ %cond_235 = arith.ori %cond_231, %cond_234 : tensor<32x16xi1> loc(#loc191)
391
+ %cond_236 = arith.extui %cond_235 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192)
392
+ %cond_237 = arith.xori %cond_236, %flip_135 : tensor<32x16xi32> loc(#loc192)
393
+ %cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<32x16xi32> loc(#loc193)
394
+ %ret_239 = arith.xori %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc194)
395
+ %ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
396
+ %ret_241 = arith.xori %ret_205, %ret_240 : tensor<32x16xi32> loc(#loc196)
397
+ %new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc197)
398
+ %new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
399
+ %new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<32x16xi32> loc(#loc199)
400
+ %y_245 = tt.reshape %ret_241 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc162)
401
+ %ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc164)
402
+ %ileft_247 = arith.muli %y_245, %ileft_246 : tensor<32x2x8xi32> loc(#loc164)
403
+ %ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({
404
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
405
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
406
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
407
+ }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc201)
408
+ %ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc166)
409
+ %ileft_250 = tt.broadcast %ileft_249 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc167)
410
+ %iright_251 = arith.muli %y_245, %flip_134 : tensor<32x2x8xi32> loc(#loc168)
411
+ %iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({
412
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
413
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
414
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
415
+ }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc203)
416
+ %iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc170)
417
+ %iright_254 = tt.broadcast %iright_253 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc171)
418
+ %ileft_255 = tt.reshape %ileft_250 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc172)
419
+ %iright_256 = tt.reshape %iright_254 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc173)
420
+ %y_idx_257 = tt.reshape %new_idxs_244 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc174)
421
+ %left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<32x2x8xi32> loc(#loc176)
422
+ %left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({
423
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
424
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
425
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
426
+ }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc206)
427
+ %left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc178)
428
+ %left_idx_261 = tt.broadcast %left_idx_260 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc179)
429
+ %right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<32x2x8xi32> loc(#loc181)
430
+ %right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({
431
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
432
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
433
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
434
+ }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc209)
435
+ %right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc183)
436
+ %right_idx_265 = tt.broadcast %right_idx_264 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc184)
437
+ %left_idx_266 = tt.reshape %left_idx_261 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc185)
438
+ %right_idx_267 = tt.reshape %right_idx_265 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc186)
439
+ %cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc187)
440
+ %eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc188)
441
+ %cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc189)
442
+ %cond_271 = arith.andi %eq_269, %cond_270 : tensor<32x16xi1> loc(#loc190)
443
+ %cond_272 = arith.ori %cond_268, %cond_271 : tensor<32x16xi1> loc(#loc191)
444
+ %ret_273 = arith.xori %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc194)
445
+ %ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
446
+ %ret_275 = arith.xori %ret_241, %ret_274 : tensor<32x16xi32> loc(#loc196)
447
+ %new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc197)
448
+ %new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
449
+ %new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<32x16xi32> loc(#loc199)
450
+ %y_279 = tt.reshape %ret_275 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162)
451
+ %ileft_280 = arith.muli %y_279, %ileft_137 : tensor<64x2x4xi32> loc(#loc164)
452
+ %ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({
453
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
454
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
455
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
456
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201)
457
+ %ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166)
458
+ %ileft_283 = tt.broadcast %ileft_282 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167)
459
+ %iright_284 = arith.muli %y_279, %flip_59 : tensor<64x2x4xi32> loc(#loc168)
460
+ %iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({
461
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
462
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
463
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
464
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203)
465
+ %iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170)
466
+ %iright_287 = tt.broadcast %iright_286 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171)
467
+ %ileft_288 = tt.reshape %ileft_283 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172)
468
+ %iright_289 = tt.reshape %iright_287 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173)
469
+ %y_idx_290 = tt.reshape %new_idxs_278 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174)
470
+ %left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<64x2x4xi32> loc(#loc176)
471
+ %left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({
472
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
473
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
474
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
475
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206)
476
+ %left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178)
477
+ %left_idx_294 = tt.broadcast %left_idx_293 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179)
478
+ %right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<64x2x4xi32> loc(#loc181)
479
+ %right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({
480
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
481
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
482
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
483
+ }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209)
484
+ %right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183)
485
+ %right_idx_298 = tt.broadcast %right_idx_297 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184)
486
+ %left_idx_299 = tt.reshape %left_idx_294 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185)
487
+ %right_idx_300 = tt.reshape %right_idx_298 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186)
488
+ %cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc187)
489
+ %eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc188)
490
+ %cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc189)
491
+ %cond_304 = arith.andi %eq_302, %cond_303 : tensor<32x16xi1> loc(#loc190)
492
+ %cond_305 = arith.ori %cond_301, %cond_304 : tensor<32x16xi1> loc(#loc191)
493
+ %ret_306 = arith.xori %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc194)
494
+ %ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
495
+ %ret_308 = arith.xori %ret_275, %ret_307 : tensor<32x16xi32> loc(#loc196)
496
+ %new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc197)
497
+ %new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
498
+ %new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<32x16xi32> loc(#loc199)
499
+ %y_312 = tt.reshape %ret_308 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162)
500
+ %ileft_313 = arith.muli %y_312, %ileft_62 : tensor<128x2x2xi32> loc(#loc164)
501
+ %ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({
502
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
503
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
504
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
505
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201)
506
+ %ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166)
507
+ %ileft_316 = tt.broadcast %ileft_315 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167)
508
+ %iright_317 = arith.muli %y_312, %flip_22 : tensor<128x2x2xi32> loc(#loc168)
509
+ %iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({
510
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
511
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
512
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
513
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203)
514
+ %iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170)
515
+ %iright_320 = tt.broadcast %iright_319 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171)
516
+ %ileft_321 = tt.reshape %ileft_316 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172)
517
+ %iright_322 = tt.reshape %iright_320 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173)
518
+ %y_idx_323 = tt.reshape %new_idxs_311 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174)
519
+ %left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<128x2x2xi32> loc(#loc176)
520
+ %left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({
521
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
522
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
523
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
524
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206)
525
+ %left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178)
526
+ %left_idx_327 = tt.broadcast %left_idx_326 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179)
527
+ %right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<128x2x2xi32> loc(#loc181)
528
+ %right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({
529
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
530
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
531
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
532
+ }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209)
533
+ %right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183)
534
+ %right_idx_331 = tt.broadcast %right_idx_330 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184)
535
+ %left_idx_332 = tt.reshape %left_idx_327 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185)
536
+ %right_idx_333 = tt.reshape %right_idx_331 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186)
537
+ %cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc187)
538
+ %eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc188)
539
+ %cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc189)
540
+ %cond_337 = arith.andi %eq_335, %cond_336 : tensor<32x16xi1> loc(#loc190)
541
+ %cond_338 = arith.ori %cond_334, %cond_337 : tensor<32x16xi1> loc(#loc191)
542
+ %ret_339 = arith.xori %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc194)
543
+ %ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195)
544
+ %ret_341 = arith.xori %ret_308, %ret_340 : tensor<32x16xi32> loc(#loc196)
545
+ %new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc197)
546
+ %new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
547
+ %new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<32x16xi32> loc(#loc199)
548
+ %y_345 = tt.reshape %ret_341 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162)
549
+ %ileft_346 = arith.muli %y_345, %ileft : tensor<256x2x1xi32> loc(#loc164)
550
+ %ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({
551
+ ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))):
552
+ %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211)
553
+ tt.reduce.return %ileft_379 : i32 loc(#loc201)
554
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201)
555
+ %ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166)
556
+ %ileft_349 = tt.broadcast %ileft_348 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167)
557
+ %iright_350 = arith.muli %y_345, %iright : tensor<256x2x1xi32> loc(#loc168)
558
+ %iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({
559
+ ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))):
560
+ %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212)
561
+ tt.reduce.return %iright_379 : i32 loc(#loc203)
562
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203)
563
+ %iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170)
564
+ %iright_353 = tt.broadcast %iright_352 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171)
565
+ %ileft_354 = tt.reshape %ileft_349 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172)
566
+ %iright_355 = tt.reshape %iright_353 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173)
567
+ %y_idx_356 = tt.reshape %new_idxs_344 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174)
568
+ %left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<256x2x1xi32> loc(#loc176)
569
+ %left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({
570
+ ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))):
571
+ %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213)
572
+ tt.reduce.return %left_idx_379 : i32 loc(#loc206)
573
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206)
574
+ %left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178)
575
+ %left_idx_360 = tt.broadcast %left_idx_359 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179)
576
+ %right_idx_361 = arith.muli %y_idx_356, %iright : tensor<256x2x1xi32> loc(#loc181)
577
+ %right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({
578
+ ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))):
579
+ %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214)
580
+ tt.reduce.return %right_idx_379 : i32 loc(#loc209)
581
+ }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209)
582
+ %right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183)
583
+ %right_idx_364 = tt.broadcast %right_idx_363 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184)
584
+ %left_idx_365 = tt.reshape %left_idx_360 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185)
585
+ %right_idx_366 = tt.reshape %right_idx_364 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186)
586
+ %cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc187)
587
+ %eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc188)
588
+ %cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc189)
589
+ %cond_370 = arith.andi %eq_368, %cond_369 : tensor<32x16xi1> loc(#loc190)
590
+ %cond_371 = arith.ori %cond_367, %cond_370 : tensor<32x16xi1> loc(#loc191)
591
+ %new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc197)
592
+ %new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198)
593
+ %new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<32x16xi32> loc(#loc199)
594
+ %tmp7 = arith.extsi %tmp0_19 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc149)
595
+ %tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc86)
596
+ %tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({
597
+ ^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))):
598
+ %tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200)
599
+ tt.reduce.return %tmp11_379 : i64 loc(#loc160)
600
+ }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc160)
601
+ %tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc151)
602
+ %tmp14 = arith.trunci %tmp11_376 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc152)
603
+ %0 = arith.muli %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc73)
604
+ %1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc74)
605
+ %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc74)
606
+ %3 = arith.addi %1, %2 : tensor<32x16xi32> loc(#loc74)
607
+ %4 = tt.splat %out_ptr2 : !tt.ptr<i32> -> tensor<32x16x!tt.ptr<i32>> loc(#loc75)
608
+ %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr<i32>>, tensor<32x16xi32> loc(#loc75)
609
+ tt.store %5, %new_idxs_374, %tmp0_18 : tensor<32x16x!tt.ptr<i32>> loc(#loc76)
610
+ %6 = tt.splat %out_ptr3 : !tt.ptr<i32> -> tensor<32x1x!tt.ptr<i32>> loc(#loc77)
611
+ %7 = tt.addptr %6, %xindex_6 : tensor<32x1x!tt.ptr<i32>>, tensor<32x1xi32> loc(#loc77)
612
+ tt.store %7, %tmp14, %xmask_7 : tensor<32x1x!tt.ptr<i32>> loc(#loc78)
613
+ tt.return loc(#loc79)
614
+ } loc(#loc)
615
+ } loc(#loc)
616
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34)
617
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49)
618
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38)
619
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21)
620
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28)
621
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:33)
622
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:36)
623
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:44)
624
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:23)
625
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:28)
626
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38)
627
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19)
628
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19)
629
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35)
630
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45)
631
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30)
632
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54)
633
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19)
634
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":40:33)
635
+ #loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41)
636
+ #loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44)
637
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60)
638
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68)
639
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22)
640
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21)
641
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40)
642
+ #loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
643
+ #loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
644
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65)
645
+ #loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78)
646
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41)
647
+ #loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67)
648
+ #loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80)
649
+ #loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30)
650
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32)
651
+ #loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29)
652
+ #loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36)
653
+ #loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23)
654
+ #loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25)
655
+ #loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53)
656
+ #loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66)
657
+ #loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37)
658
+ #loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23)
659
+ #loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54)
660
+ #loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67)
661
+ #loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36)
662
+ #loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38)
663
+ #loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22)
664
+ #loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21)
665
+ #loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40)
666
+ #loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29)
667
+ #loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23)
668
+ #loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19)
669
+ #loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28)
670
+ #loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38)
671
+ #loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46)
672
+ #loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15)
673
+ #loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48)
674
+ #loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59)
675
+ #loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22)
676
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19)
677
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29)
678
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21)
679
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35)
680
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32)
681
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25)
682
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47)
683
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25)
684
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37)
685
+ #loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4)
686
+ #loc85 = loc(callsite(#loc1 at #loc2))
687
+ #loc86 = loc("tmp10"(#loc3))
688
+ #loc87 = loc("tmp0"(#loc4))
689
+ #loc88 = loc("tmp0"(#loc5))
690
+ #loc89 = loc("xmask"(#loc6))
691
+ #loc90 = loc("xoffset"(#loc7))
692
+ #loc91 = loc("xoffset"(#loc8))
693
+ #loc92 = loc("xindex"(#loc9))
694
+ #loc93 = loc("xindex"(#loc10))
695
+ #loc94 = loc("xindex"(#loc11))
696
+ #loc95 = loc("r0_index"(#loc12))
697
+ #loc96 = loc("r0_index"(#loc13))
698
+ #loc97 = loc("x0"(#loc14))
699
+ #loc98 = loc("x1"(#loc15))
700
+ #loc99 = loc("tmp0"(#loc16))
701
+ #loc100 = loc("tmp0"(#loc17))
702
+ #loc101 = loc("tmp0"(#loc18))
703
+ #loc102 = loc("tmp0"(#loc19))
704
+ #loc103 = loc("tmp2"(#loc20))
705
+ #loc104 = loc("tmp4"(#loc21))
706
+ #loc105 = loc("flip"(#loc22))
707
+ #loc107 = loc("flip"(#loc24))
708
+ #loc108 = loc("flip"(#loc25))
709
+ #loc109 = loc("flip"(#loc26))
710
+ #loc110 = loc("y"(#loc27))
711
+ #loc111 = loc("left_mask"(#loc29))
712
+ #loc112 = loc("ileft"(#loc30))
713
+ #loc114 = loc("ileft"(#loc34))
714
+ #loc115 = loc("ileft"(#loc35))
715
+ #loc116 = loc("iright"(#loc36))
716
+ #loc118 = loc("iright"(#loc38))
717
+ #loc119 = loc("iright"(#loc39))
718
+ #loc120 = loc("ileft"(#loc40))
719
+ #loc121 = loc("iright"(#loc41))
720
+ #loc122 = loc("y_idx"(#loc42))
721
+ #loc123 = loc("left_idx"(#loc43))
722
+ #loc124 = loc("left_idx"(#loc44))
723
+ #loc125 = loc("input"(#loc45))
724
+ #loc127 = loc("left_idx"(#loc47))
725
+ #loc128 = loc("left_idx"(#loc48))
726
+ #loc129 = loc("right_idx"(#loc49))
727
+ #loc130 = loc("right_idx"(#loc50))
728
+ #loc132 = loc("right_idx"(#loc52))
729
+ #loc133 = loc("right_idx"(#loc53))
730
+ #loc134 = loc("left_idx"(#loc54))
731
+ #loc135 = loc("right_idx"(#loc55))
732
+ #loc136 = loc("cond"(#loc56))
733
+ #loc137 = loc("eq"(#loc57))
734
+ #loc138 = loc("cond"(#loc58))
735
+ #loc139 = loc("cond"(#loc59))
736
+ #loc140 = loc("cond"(#loc60))
737
+ #loc141 = loc("cond"(#loc61))
738
+ #loc142 = loc("cond"(#loc62))
739
+ #loc143 = loc("ret"(#loc63))
740
+ #loc144 = loc("ret"(#loc64))
741
+ #loc145 = loc("ret"(#loc65))
742
+ #loc146 = loc("new_idxs"(#loc66))
743
+ #loc147 = loc("new_idxs"(#loc67))
744
+ #loc148 = loc("new_idxs"(#loc68))
745
+ #loc149 = loc("tmp7"(#loc69))
746
+ #loc151 = loc("tmp11"(#loc71))
747
+ #loc152 = loc("tmp14"(#loc72))
748
+ #loc153 = loc(callsite(#loc105 at #loc106))
749
+ #loc154 = loc(callsite(#loc107 at #loc106))
750
+ #loc155 = loc(callsite(#loc108 at #loc106))
751
+ #loc156 = loc(callsite(#loc109 at #loc106))
752
+ #loc158 = loc("cond"(#loc136))
753
+ #loc159 = loc("eq"(#loc137))
754
+ #loc160 = loc(callsite(#loc31 at #loc150))
755
+ #loc162 = loc(callsite(#loc110 at #loc157))
756
+ #loc163 = loc(callsite(#loc111 at #loc157))
757
+ #loc164 = loc(callsite(#loc112 at #loc157))
758
+ #loc166 = loc(callsite(#loc114 at #loc157))
759
+ #loc167 = loc(callsite(#loc115 at #loc157))
760
+ #loc168 = loc(callsite(#loc116 at #loc157))
761
+ #loc170 = loc(callsite(#loc118 at #loc157))
762
+ #loc171 = loc(callsite(#loc119 at #loc157))
763
+ #loc172 = loc(callsite(#loc120 at #loc157))
764
+ #loc173 = loc(callsite(#loc121 at #loc157))
765
+ #loc174 = loc(callsite(#loc122 at #loc157))
766
+ #loc175 = loc(callsite(#loc123 at #loc157))
767
+ #loc176 = loc(callsite(#loc124 at #loc157))
768
+ #loc178 = loc(callsite(#loc127 at #loc157))
769
+ #loc179 = loc(callsite(#loc128 at #loc157))
770
+ #loc180 = loc(callsite(#loc129 at #loc157))
771
+ #loc181 = loc(callsite(#loc130 at #loc157))
772
+ #loc183 = loc(callsite(#loc132 at #loc157))
773
+ #loc184 = loc(callsite(#loc133 at #loc157))
774
+ #loc185 = loc(callsite(#loc134 at #loc157))
775
+ #loc186 = loc(callsite(#loc135 at #loc157))
776
+ #loc187 = loc(callsite(#loc158 at #loc157))
777
+ #loc188 = loc(callsite(#loc159 at #loc157))
778
+ #loc189 = loc(callsite(#loc138 at #loc157))
779
+ #loc190 = loc(callsite(#loc139 at #loc157))
780
+ #loc191 = loc(callsite(#loc140 at #loc157))
781
+ #loc192 = loc(callsite(#loc141 at #loc157))
782
+ #loc193 = loc(callsite(#loc142 at #loc157))
783
+ #loc194 = loc(callsite(#loc143 at #loc157))
784
+ #loc195 = loc(callsite(#loc144 at #loc157))
785
+ #loc196 = loc(callsite(#loc145 at #loc157))
786
+ #loc197 = loc(callsite(#loc146 at #loc157))
787
+ #loc198 = loc(callsite(#loc147 at #loc157))
788
+ #loc199 = loc(callsite(#loc148 at #loc157))
789
+ #loc200 = loc(callsite(#loc33 at #loc160))
790
+ #loc201 = loc(callsite(#loc31 at #loc165))
791
+ #loc203 = loc(callsite(#loc31 at #loc169))
792
+ #loc205 = loc(callsite(#loc125 at #loc177))
793
+ #loc206 = loc(callsite(#loc31 at #loc177))
794
+ #loc208 = loc(callsite(#loc125 at #loc182))
795
+ #loc209 = loc(callsite(#loc31 at #loc182))
796
+ #loc211 = loc(callsite(#loc33 at #loc201))
797
+ #loc212 = loc(callsite(#loc33 at #loc203))
798
+ #loc213 = loc(callsite(#loc33 at #loc206))
799
+ #loc214 = loc(callsite(#loc33 at #loc209))
SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.cubin ADDED
Binary file (15.1 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.llir ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_per_fused__to_copy_mul_sum_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
9
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %9 = shl i32 %8, 7, !dbg !8
11
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %11 = and i32 %10, 127, !dbg !9
13
+ %12 = or disjoint i32 %9, %11, !dbg !10
14
+ %13 = icmp slt i32 %12, %3, !dbg !11
15
+ %14 = lshr i32 %10, 7, !dbg !12
16
+ %.lobit = and i32 %14, 1, !dbg !12
17
+ %15 = or disjoint i32 %.lobit, 2, !dbg !12
18
+ %16 = or disjoint i32 %.lobit, 4, !dbg !12
19
+ %17 = or i32 %14, 6, !dbg !12
20
+ %18 = or disjoint i32 %.lobit, 8, !dbg !12
21
+ %19 = or disjoint i32 %.lobit, 10, !dbg !12
22
+ %20 = or disjoint i32 %.lobit, 12, !dbg !12
23
+ %21 = or i32 %14, 14, !dbg !12
24
+ %22 = or disjoint i32 %.lobit, 16, !dbg !12
25
+ %23 = or disjoint i32 %.lobit, 18, !dbg !12
26
+ %24 = or disjoint i32 %.lobit, 20, !dbg !12
27
+ %25 = or i32 %14, 22, !dbg !12
28
+ %26 = or disjoint i32 %.lobit, 24, !dbg !12
29
+ %27 = or disjoint i32 %.lobit, 26, !dbg !12
30
+ %28 = or disjoint i32 %.lobit, 28, !dbg !12
31
+ %29 = or i32 %14, 30, !dbg !12
32
+ %30 = zext nneg i32 %.lobit to i64, !dbg !13
33
+ %31 = zext nneg i32 %15 to i64, !dbg !13
34
+ %32 = zext nneg i32 %16 to i64, !dbg !13
35
+ %33 = zext nneg i32 %17 to i64, !dbg !13
36
+ %34 = zext nneg i32 %18 to i64, !dbg !13
37
+ %35 = zext nneg i32 %19 to i64, !dbg !13
38
+ %36 = zext nneg i32 %20 to i64, !dbg !13
39
+ %37 = zext nneg i32 %21 to i64, !dbg !13
40
+ %38 = zext nneg i32 %22 to i64, !dbg !13
41
+ %39 = zext nneg i32 %23 to i64, !dbg !13
42
+ %40 = zext nneg i32 %24 to i64, !dbg !13
43
+ %41 = zext nneg i32 %25 to i64, !dbg !13
44
+ %42 = zext nneg i32 %26 to i64, !dbg !13
45
+ %43 = zext nneg i32 %27 to i64, !dbg !13
46
+ %44 = zext nneg i32 %28 to i64, !dbg !13
47
+ %45 = zext nneg i32 %29 to i64, !dbg !13
48
+ %46 = mul nuw nsw i64 %2, %30, !dbg !13
49
+ %47 = mul i64 %2, %31, !dbg !13
50
+ %48 = mul i64 %2, %32, !dbg !13
51
+ %49 = mul i64 %2, %33, !dbg !13
52
+ %50 = mul i64 %2, %34, !dbg !13
53
+ %51 = mul i64 %2, %35, !dbg !13
54
+ %52 = mul i64 %2, %36, !dbg !13
55
+ %53 = mul i64 %2, %37, !dbg !13
56
+ %54 = mul i64 %2, %38, !dbg !13
57
+ %55 = mul i64 %2, %39, !dbg !13
58
+ %56 = mul i64 %2, %40, !dbg !13
59
+ %57 = mul i64 %2, %41, !dbg !13
60
+ %58 = mul i64 %2, %42, !dbg !13
61
+ %59 = mul i64 %2, %43, !dbg !13
62
+ %60 = mul i64 %2, %44, !dbg !13
63
+ %61 = mul i64 %2, %45, !dbg !13
64
+ %62 = sext i32 %12 to i64, !dbg !14
65
+ %63 = getelementptr float, ptr addrspace(1) %0, i64 %46, !dbg !15
66
+ %64 = getelementptr float, ptr addrspace(1) %63, i64 %62, !dbg !15
67
+ %65 = getelementptr float, ptr addrspace(1) %0, i64 %47, !dbg !15
68
+ %66 = getelementptr float, ptr addrspace(1) %65, i64 %62, !dbg !15
69
+ %67 = getelementptr float, ptr addrspace(1) %0, i64 %48, !dbg !15
70
+ %68 = getelementptr float, ptr addrspace(1) %67, i64 %62, !dbg !15
71
+ %69 = getelementptr float, ptr addrspace(1) %0, i64 %49, !dbg !15
72
+ %70 = getelementptr float, ptr addrspace(1) %69, i64 %62, !dbg !15
73
+ %71 = getelementptr float, ptr addrspace(1) %0, i64 %50, !dbg !15
74
+ %72 = getelementptr float, ptr addrspace(1) %71, i64 %62, !dbg !15
75
+ %73 = getelementptr float, ptr addrspace(1) %0, i64 %51, !dbg !15
76
+ %74 = getelementptr float, ptr addrspace(1) %73, i64 %62, !dbg !15
77
+ %75 = getelementptr float, ptr addrspace(1) %0, i64 %52, !dbg !15
78
+ %76 = getelementptr float, ptr addrspace(1) %75, i64 %62, !dbg !15
79
+ %77 = getelementptr float, ptr addrspace(1) %0, i64 %53, !dbg !15
80
+ %78 = getelementptr float, ptr addrspace(1) %77, i64 %62, !dbg !15
81
+ %79 = getelementptr float, ptr addrspace(1) %0, i64 %54, !dbg !15
82
+ %80 = getelementptr float, ptr addrspace(1) %79, i64 %62, !dbg !15
83
+ %81 = getelementptr float, ptr addrspace(1) %0, i64 %55, !dbg !15
84
+ %82 = getelementptr float, ptr addrspace(1) %81, i64 %62, !dbg !15
85
+ %83 = getelementptr float, ptr addrspace(1) %0, i64 %56, !dbg !15
86
+ %84 = getelementptr float, ptr addrspace(1) %83, i64 %62, !dbg !15
87
+ %85 = getelementptr float, ptr addrspace(1) %0, i64 %57, !dbg !15
88
+ %86 = getelementptr float, ptr addrspace(1) %85, i64 %62, !dbg !15
89
+ %87 = getelementptr float, ptr addrspace(1) %0, i64 %58, !dbg !15
90
+ %88 = getelementptr float, ptr addrspace(1) %87, i64 %62, !dbg !15
91
+ %89 = getelementptr float, ptr addrspace(1) %0, i64 %59, !dbg !15
92
+ %90 = getelementptr float, ptr addrspace(1) %89, i64 %62, !dbg !15
93
+ %91 = getelementptr float, ptr addrspace(1) %0, i64 %60, !dbg !15
94
+ %92 = getelementptr float, ptr addrspace(1) %91, i64 %62, !dbg !15
95
+ %93 = getelementptr float, ptr addrspace(1) %0, i64 %61, !dbg !15
96
+ %94 = getelementptr float, ptr addrspace(1) %93, i64 %62, !dbg !15
97
+ %95 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %64, i1 %13) #4, !dbg !16
98
+ %96 = bitcast i32 %95 to float, !dbg !16
99
+ %97 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %66, i1 %13) #4, !dbg !16
100
+ %98 = bitcast i32 %97 to float, !dbg !16
101
+ %99 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %68, i1 %13) #4, !dbg !16
102
+ %100 = bitcast i32 %99 to float, !dbg !16
103
+ %101 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %70, i1 %13) #4, !dbg !16
104
+ %102 = bitcast i32 %101 to float, !dbg !16
105
+ %103 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %72, i1 %13) #4, !dbg !16
106
+ %104 = bitcast i32 %103 to float, !dbg !16
107
+ %105 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %74, i1 %13) #4, !dbg !16
108
+ %106 = bitcast i32 %105 to float, !dbg !16
109
+ %107 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %76, i1 %13) #4, !dbg !16
110
+ %108 = bitcast i32 %107 to float, !dbg !16
111
+ %109 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %78, i1 %13) #4, !dbg !16
112
+ %110 = bitcast i32 %109 to float, !dbg !16
113
+ %111 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %80, i1 %13) #4, !dbg !16
114
+ %112 = bitcast i32 %111 to float, !dbg !16
115
+ %113 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %82, i1 %13) #4, !dbg !16
116
+ %114 = bitcast i32 %113 to float, !dbg !16
117
+ %115 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %84, i1 %13) #4, !dbg !16
118
+ %116 = bitcast i32 %115 to float, !dbg !16
119
+ %117 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %86, i1 %13) #4, !dbg !16
120
+ %118 = bitcast i32 %117 to float, !dbg !16
121
+ %119 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %88, i1 %13) #4, !dbg !16
122
+ %120 = bitcast i32 %119 to float, !dbg !16
123
+ %121 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %90, i1 %13) #4, !dbg !16
124
+ %122 = bitcast i32 %121 to float, !dbg !16
125
+ %123 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %92, i1 %13) #4, !dbg !16
126
+ %124 = bitcast i32 %123 to float, !dbg !16
127
+ %125 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %94, i1 %13) #4, !dbg !16
128
+ %126 = bitcast i32 %125 to float, !dbg !16
129
+ %127 = fadd float %96, %98, !dbg !17
130
+ %128 = fadd float %127, %100, !dbg !17
131
+ %129 = fadd float %128, %102, !dbg !17
132
+ %130 = fadd float %129, %104, !dbg !17
133
+ %131 = fadd float %130, %106, !dbg !17
134
+ %132 = fadd float %131, %108, !dbg !17
135
+ %133 = fadd float %132, %110, !dbg !17
136
+ %134 = fadd float %133, %112, !dbg !17
137
+ %135 = fadd float %134, %114, !dbg !17
138
+ %136 = fadd float %135, %116, !dbg !17
139
+ %137 = fadd float %136, %118, !dbg !17
140
+ %138 = fadd float %137, %120, !dbg !17
141
+ %139 = fadd float %138, %122, !dbg !17
142
+ %140 = fadd float %139, %124, !dbg !17
143
+ %141 = fadd float %140, %126, !dbg !17
144
+ %.idx = shl nuw nsw i32 %11, 3, !dbg !21
145
+ %142 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !21
146
+ %143 = getelementptr float, ptr addrspace(3) %142, i32 %.lobit, !dbg !21
147
+ %144 = bitcast float %141 to i32, !dbg !21
148
+ %145 = select i1 %13, i32 %144, i32 0, !dbg !17
149
+ %146 = insertelement <1 x i32> poison, i32 %145, i64 0, !dbg !21
150
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %143, <1 x i32> %146, i1 true) #4, !dbg !21
151
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
152
+ %147 = icmp samesign ult i32 %10, 256, !dbg !21
153
+ %148 = getelementptr float, ptr addrspace(3) @global_smem, i32 %10, !dbg !21
154
+ %149 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %148, i1 %147) #4, !dbg !21
155
+ %150 = bitcast i32 %149 to float, !dbg !21
156
+ %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 1, i32 31), !dbg !21
157
+ %152 = bitcast i32 %151 to float, !dbg !21
158
+ %153 = fadd float %150, %152, !dbg !17
159
+ %154 = and i32 %10, 769, !dbg !21
160
+ %155 = icmp eq i32 %154, 0, !dbg !21
161
+ %156 = bitcast float %153 to <1 x i32>, !dbg !21
162
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %148, <1 x i32> %156, i1 %155) #4, !dbg !21
163
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
164
+ %157 = load float, ptr addrspace(3) %142, align 8, !dbg !21
165
+ %158 = getelementptr bfloat, ptr addrspace(1) %1, i64 %62, !dbg !22
166
+ %159 = fptrunc float %157 to bfloat, !dbg !23
167
+ %160 = and i32 %10, 128, !dbg !23
168
+ %161 = icmp eq i32 %160, 0, !dbg !23
169
+ %162 = bitcast bfloat %159 to i16, !dbg !23
170
+ %163 = and i1 %161, %13, !dbg !23
171
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %162, ptr addrspace(1) %158, i1 %163) #4, !dbg !23
172
+ ret void, !dbg !24
173
+ }
174
+
175
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
176
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
177
+
178
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
179
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
180
+
181
+ ; Function Attrs: convergent nocallback nounwind
182
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
183
+
184
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
185
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
186
+
187
+ attributes #0 = { nounwind "nvvm.reqntid"="256" }
188
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
189
+ attributes #2 = { convergent nocallback nounwind }
190
+ attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
191
+ attributes #4 = { nounwind }
192
+
193
+ !llvm.dbg.cu = !{!0}
194
+ !llvm.module.flags = !{!2, !3}
195
+
196
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
197
+ !1 = !DIFile(filename: "cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf")
198
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
199
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
200
+ !4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_mul_sum_1", linkageName: "triton_per_fused__to_copy_mul_sum_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
201
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
202
+ !6 = !{}
203
+ !7 = !DILocation(line: 23, column: 28, scope: !4)
204
+ !8 = !DILocation(line: 23, column: 33, scope: !4)
205
+ !9 = !DILocation(line: 24, column: 44, scope: !4)
206
+ !10 = !DILocation(line: 24, column: 23, scope: !4)
207
+ !11 = !DILocation(line: 25, column: 21, scope: !4)
208
+ !12 = !DILocation(line: 26, column: 38, scope: !4)
209
+ !13 = !DILocation(line: 33, column: 39, scope: !4)
210
+ !14 = !DILocation(line: 33, column: 35, scope: !4)
211
+ !15 = !DILocation(line: 33, column: 30, scope: !4)
212
+ !16 = !DILocation(line: 33, column: 46, scope: !4)
213
+ !17 = !DILocation(line: 261, column: 15, scope: !18, inlinedAt: !20)
214
+ !18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0)
215
+ !19 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
216
+ !20 = !DILocation(line: 36, column: 24, scope: !4)
217
+ !21 = !DILocation(line: 291, column: 36, scope: !18, inlinedAt: !20)
218
+ !22 = !DILocation(line: 37, column: 25, scope: !4)
219
+ !23 = !DILocation(line: 37, column: 36, scope: !4)
220
+ !24 = !DILocation(line: 37, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ptx ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_per_fused__to_copy_mul_sum_1 // -- Begin function triton_per_fused__to_copy_mul_sum_1
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_per_fused__to_copy_mul_sum_1
12
+ .visible .entry triton_per_fused__to_copy_mul_sum_1(
13
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_mul_sum_1_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_mul_sum_1_param_1,
15
+ .param .u64 triton_per_fused__to_copy_mul_sum_1_param_2,
16
+ .param .u32 triton_per_fused__to_copy_mul_sum_1_param_3,
17
+ .param .u32 triton_per_fused__to_copy_mul_sum_1_param_4,
18
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_mul_sum_1_param_5,
19
+ .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_mul_sum_1_param_6
20
+ )
21
+ .reqntid 256
22
+ {
23
+ .reg .pred %p<22>;
24
+ .reg .b16 %rs<2>;
25
+ .reg .b32 %r<86>;
26
+ .reg .b64 %rd<86>;
27
+ .loc 1 18 0 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:18:0
28
+ $L__func_begin0:
29
+ .loc 1 18 0 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:18:0
30
+
31
+ // %bb.0:
32
+ ld.param.b64 %rd18, [triton_per_fused__to_copy_mul_sum_1_param_0];
33
+ ld.param.b64 %rd19, [triton_per_fused__to_copy_mul_sum_1_param_1];
34
+ $L__tmp0:
35
+ .loc 1 23 28 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:23:28
36
+ mov.u32 %r39, %ctaid.x;
37
+ .loc 1 23 33 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:23:33
38
+ shl.b32 %r40, %r39, 7;
39
+ ld.param.b64 %rd20, [triton_per_fused__to_copy_mul_sum_1_param_2];
40
+ ld.param.b32 %r41, [triton_per_fused__to_copy_mul_sum_1_param_3];
41
+ .loc 1 24 44 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:24:44
42
+ mov.u32 %r42, %tid.x;
43
+ and.b32 %r43, %r42, 127;
44
+ .loc 1 24 23 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:24:23
45
+ or.b32 %r44, %r40, %r43;
46
+ .loc 1 25 21 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:25:21
47
+ setp.lt.s32 %p1, %r44, %r41;
48
+ .loc 1 26 38 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:26:38
49
+ shr.u32 %r45, %r42, 7;
50
+ bfe.u32 %r46, %r42, 7, 1;
51
+ or.b32 %r47, %r46, 2;
52
+ or.b32 %r48, %r46, 4;
53
+ or.b32 %r49, %r45, 6;
54
+ or.b32 %r50, %r46, 8;
55
+ or.b32 %r51, %r46, 10;
56
+ or.b32 %r52, %r46, 12;
57
+ or.b32 %r53, %r45, 14;
58
+ or.b32 %r54, %r46, 16;
59
+ or.b32 %r55, %r46, 18;
60
+ or.b32 %r56, %r46, 20;
61
+ or.b32 %r57, %r45, 22;
62
+ or.b32 %r58, %r46, 24;
63
+ or.b32 %r59, %r46, 26;
64
+ or.b32 %r60, %r46, 28;
65
+ or.b32 %r61, %r45, 30;
66
+ .loc 1 33 39 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:33:39
67
+ cvt.u64.u32 %rd21, %r46;
68
+ cvt.u64.u32 %rd22, %r47;
69
+ cvt.u64.u32 %rd23, %r48;
70
+ cvt.u64.u32 %rd24, %r49;
71
+ cvt.u64.u32 %rd25, %r50;
72
+ cvt.u64.u32 %rd26, %r51;
73
+ cvt.u64.u32 %rd27, %r52;
74
+ cvt.u64.u32 %rd28, %r53;
75
+ cvt.u64.u32 %rd29, %r54;
76
+ cvt.u64.u32 %rd30, %r55;
77
+ cvt.u64.u32 %rd31, %r56;
78
+ cvt.u64.u32 %rd32, %r57;
79
+ cvt.u64.u32 %rd33, %r58;
80
+ cvt.u64.u32 %rd34, %r59;
81
+ cvt.u64.u32 %rd35, %r60;
82
+ cvt.u64.u32 %rd36, %r61;
83
+ mul.lo.s64 %rd37, %rd20, %rd21;
84
+ mul.lo.s64 %rd38, %rd20, %rd22;
85
+ mul.lo.s64 %rd39, %rd20, %rd23;
86
+ mul.lo.s64 %rd40, %rd20, %rd24;
87
+ mul.lo.s64 %rd41, %rd20, %rd25;
88
+ mul.lo.s64 %rd42, %rd20, %rd26;
89
+ mul.lo.s64 %rd43, %rd20, %rd27;
90
+ mul.lo.s64 %rd44, %rd20, %rd28;
91
+ mul.lo.s64 %rd45, %rd20, %rd29;
92
+ mul.lo.s64 %rd46, %rd20, %rd30;
93
+ mul.lo.s64 %rd47, %rd20, %rd31;
94
+ mul.lo.s64 %rd48, %rd20, %rd32;
95
+ mul.lo.s64 %rd49, %rd20, %rd33;
96
+ mul.lo.s64 %rd50, %rd20, %rd34;
97
+ mul.lo.s64 %rd51, %rd20, %rd35;
98
+ mul.lo.s64 %rd52, %rd20, %rd36;
99
+ .loc 1 33 30 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:33:30
100
+ shl.b64 %rd53, %rd37, 2;
101
+ add.s64 %rd54, %rd18, %rd53;
102
+ mul.wide.s32 %rd55, %r44, 4;
103
+ add.s64 %rd1, %rd54, %rd55;
104
+ shl.b64 %rd56, %rd38, 2;
105
+ add.s64 %rd57, %rd18, %rd56;
106
+ add.s64 %rd2, %rd57, %rd55;
107
+ shl.b64 %rd58, %rd39, 2;
108
+ add.s64 %rd59, %rd18, %rd58;
109
+ add.s64 %rd3, %rd59, %rd55;
110
+ shl.b64 %rd60, %rd40, 2;
111
+ add.s64 %rd61, %rd18, %rd60;
112
+ add.s64 %rd4, %rd61, %rd55;
113
+ shl.b64 %rd62, %rd41, 2;
114
+ add.s64 %rd63, %rd18, %rd62;
115
+ add.s64 %rd5, %rd63, %rd55;
116
+ shl.b64 %rd64, %rd42, 2;
117
+ add.s64 %rd65, %rd18, %rd64;
118
+ add.s64 %rd6, %rd65, %rd55;
119
+ shl.b64 %rd66, %rd43, 2;
120
+ add.s64 %rd67, %rd18, %rd66;
121
+ add.s64 %rd7, %rd67, %rd55;
122
+ shl.b64 %rd68, %rd44, 2;
123
+ add.s64 %rd69, %rd18, %rd68;
124
+ add.s64 %rd8, %rd69, %rd55;
125
+ shl.b64 %rd70, %rd45, 2;
126
+ add.s64 %rd71, %rd18, %rd70;
127
+ add.s64 %rd9, %rd71, %rd55;
128
+ shl.b64 %rd72, %rd46, 2;
129
+ add.s64 %rd73, %rd18, %rd72;
130
+ add.s64 %rd10, %rd73, %rd55;
131
+ shl.b64 %rd74, %rd47, 2;
132
+ add.s64 %rd75, %rd18, %rd74;
133
+ add.s64 %rd11, %rd75, %rd55;
134
+ shl.b64 %rd76, %rd48, 2;
135
+ add.s64 %rd77, %rd18, %rd76;
136
+ add.s64 %rd12, %rd77, %rd55;
137
+ shl.b64 %rd78, %rd49, 2;
138
+ add.s64 %rd79, %rd18, %rd78;
139
+ add.s64 %rd13, %rd79, %rd55;
140
+ shl.b64 %rd80, %rd50, 2;
141
+ add.s64 %rd81, %rd18, %rd80;
142
+ add.s64 %rd14, %rd81, %rd55;
143
+ shl.b64 %rd82, %rd51, 2;
144
+ add.s64 %rd83, %rd18, %rd82;
145
+ add.s64 %rd15, %rd83, %rd55;
146
+ shl.b64 %rd84, %rd52, 2;
147
+ add.s64 %rd85, %rd18, %rd84;
148
+ add.s64 %rd16, %rd85, %rd55;
149
+ mov.b32 %r2, 0;
150
+ .loc 1 33 46 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:33:46
151
+ // begin inline asm
152
+ mov.u32 %r1, %r2;
153
+ @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
154
+ // end inline asm
155
+ // begin inline asm
156
+ mov.u32 %r3, %r2;
157
+ @%p1 ld.global.b32 { %r3 }, [ %rd2 + 0 ];
158
+ // end inline asm
159
+ // begin inline asm
160
+ mov.u32 %r5, %r2;
161
+ @%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ];
162
+ // end inline asm
163
+ // begin inline asm
164
+ mov.u32 %r7, %r2;
165
+ @%p1 ld.global.b32 { %r7 }, [ %rd4 + 0 ];
166
+ // end inline asm
167
+ // begin inline asm
168
+ mov.u32 %r9, %r2;
169
+ @%p1 ld.global.b32 { %r9 }, [ %rd5 + 0 ];
170
+ // end inline asm
171
+ // begin inline asm
172
+ mov.u32 %r11, %r2;
173
+ @%p1 ld.global.b32 { %r11 }, [ %rd6 + 0 ];
174
+ // end inline asm
175
+ // begin inline asm
176
+ mov.u32 %r13, %r2;
177
+ @%p1 ld.global.b32 { %r13 }, [ %rd7 + 0 ];
178
+ // end inline asm
179
+ // begin inline asm
180
+ mov.u32 %r15, %r2;
181
+ @%p1 ld.global.b32 { %r15 }, [ %rd8 + 0 ];
182
+ // end inline asm
183
+ // begin inline asm
184
+ mov.u32 %r17, %r2;
185
+ @%p1 ld.global.b32 { %r17 }, [ %rd9 + 0 ];
186
+ // end inline asm
187
+ // begin inline asm
188
+ mov.u32 %r19, %r2;
189
+ @%p1 ld.global.b32 { %r19 }, [ %rd10 + 0 ];
190
+ // end inline asm
191
+ // begin inline asm
192
+ mov.u32 %r21, %r2;
193
+ @%p1 ld.global.b32 { %r21 }, [ %rd11 + 0 ];
194
+ // end inline asm
195
+ // begin inline asm
196
+ mov.u32 %r23, %r2;
197
+ @%p1 ld.global.b32 { %r23 }, [ %rd12 + 0 ];
198
+ // end inline asm
199
+ // begin inline asm
200
+ mov.u32 %r25, %r2;
201
+ @%p1 ld.global.b32 { %r25 }, [ %rd13 + 0 ];
202
+ // end inline asm
203
+ // begin inline asm
204
+ mov.u32 %r27, %r2;
205
+ @%p1 ld.global.b32 { %r27 }, [ %rd14 + 0 ];
206
+ // end inline asm
207
+ // begin inline asm
208
+ mov.u32 %r29, %r2;
209
+ @%p1 ld.global.b32 { %r29 }, [ %rd15 + 0 ];
210
+ // end inline asm
211
+ // begin inline asm
212
+ mov.u32 %r31, %r2;
213
+ @%p1 ld.global.b32 { %r31 }, [ %rd16 + 0 ];
214
+ // end inline asm
215
+ $L__tmp1:
216
+ .loc 2 261 15 // standard.py:261:15 @[ cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:36:24 ]
217
+ add.f32 %r62, %r1, %r3;
218
+ add.f32 %r63, %r62, %r5;
219
+ add.f32 %r64, %r63, %r7;
220
+ add.f32 %r65, %r64, %r9;
221
+ add.f32 %r66, %r65, %r11;
222
+ add.f32 %r67, %r66, %r13;
223
+ add.f32 %r68, %r67, %r15;
224
+ add.f32 %r69, %r68, %r17;
225
+ add.f32 %r70, %r69, %r19;
226
+ add.f32 %r71, %r70, %r21;
227
+ add.f32 %r72, %r71, %r23;
228
+ add.f32 %r73, %r72, %r25;
229
+ add.f32 %r74, %r73, %r27;
230
+ add.f32 %r75, %r74, %r29;
231
+ add.f32 %r76, %r75, %r31;
232
+ .loc 2 291 36 // standard.py:291:36 @[ cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:36:24 ]
233
+ shl.b32 %r77, %r43, 3;
234
+ mov.b32 %r78, global_smem;
235
+ add.s32 %r79, %r78, %r77;
236
+ shl.b32 %r80, %r46, 2;
237
+ add.s32 %r33, %r79, %r80;
238
+ .loc 2 261 15 // standard.py:261:15 @[ cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:36:24 ]
239
+ selp.b32 %r34, %r76, 0, %p1;
240
+ mov.pred %p17, -1;
241
+ .loc 2 291 36 // standard.py:291:36 @[ cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:36:24 ]
242
+ // begin inline asm
243
+ @%p17 st.shared.b32 [ %r33 + 0 ], %r34;
244
+ // end inline asm
245
+ bar.sync 0;
246
+ setp.lt.u32 %p18, %r42, 256;
247
+ shl.b32 %r81, %r42, 2;
248
+ add.s32 %r36, %r78, %r81;
249
+ // begin inline asm
250
+ @%p18 ld.shared.b32 %r35, [ %r36 + 0 ];
251
+ // end inline asm
252
+ shfl.sync.bfly.b32 %r82, %r35, 1, 31, -1;
253
+ .loc 2 261 15 // standard.py:261:15 @[ cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:36:24 ]
254
+ add.f32 %r38, %r35, %r82;
255
+ .loc 2 291 36 // standard.py:291:36 @[ cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:36:24 ]
256
+ and.b32 %r83, %r42, 769;
257
+ setp.eq.b32 %p19, %r83, 0;
258
+ // begin inline asm
259
+ @%p19 st.shared.b32 [ %r36 + 0 ], %r38;
260
+ // end inline asm
261
+ bar.sync 0;
262
+ ld.shared.b32 %r84, [%r79];
263
+ $L__tmp2:
264
+ .loc 1 37 25 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:37:25
265
+ mad.wide.s32 %rd17, %r44, 2, %rd19;
266
+ .loc 1 37 36 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:37:36
267
+ cvt.rn.bf16.f32 %rs1, %r84;
268
+ and.b32 %r85, %r42, 128;
269
+ setp.eq.b32 %p21, %r85, 0;
270
+ and.pred %p20, %p21, %p1;
271
+ // begin inline asm
272
+ @%p20 st.global.b16 [ %rd17 + 0 ], { %rs1 };
273
+ // end inline asm
274
+ .loc 1 37 4 // cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py:37:4
275
+ ret;
276
+ $L__tmp3:
277
+ $L__func_end0:
278
+ // -- End function
279
+ }
280
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py"
281
+ .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
282
+ .section .debug_abbrev
283
+ {
284
+ .b8 1 // Abbreviation Code
285
+ .b8 17 // DW_TAG_compile_unit
286
+ .b8 1 // DW_CHILDREN_yes
287
+ .b8 37 // DW_AT_producer
288
+ .b8 8 // DW_FORM_string
289
+ .b8 19 // DW_AT_language
290
+ .b8 5 // DW_FORM_data2
291
+ .b8 3 // DW_AT_name
292
+ .b8 8 // DW_FORM_string
293
+ .b8 16 // DW_AT_stmt_list
294
+ .b8 6 // DW_FORM_data4
295
+ .b8 27 // DW_AT_comp_dir
296
+ .b8 8 // DW_FORM_string
297
+ .b8 0 // EOM(1)
298
+ .b8 0 // EOM(2)
299
+ .b8 2 // Abbreviation Code
300
+ .b8 46 // DW_TAG_subprogram
301
+ .b8 0 // DW_CHILDREN_no
302
+ .b8 3 // DW_AT_name
303
+ .b8 8 // DW_FORM_string
304
+ .b8 32 // DW_AT_inline
305
+ .b8 11 // DW_FORM_data1
306
+ .b8 0 // EOM(1)
307
+ .b8 0 // EOM(2)
308
+ .b8 3 // Abbreviation Code
309
+ .b8 46 // DW_TAG_subprogram
310
+ .b8 1 // DW_CHILDREN_yes
311
+ .b8 17 // DW_AT_low_pc
312
+ .b8 1 // DW_FORM_addr
313
+ .b8 18 // DW_AT_high_pc
314
+ .b8 1 // DW_FORM_addr
315
+ .b8 49 // DW_AT_abstract_origin
316
+ .b8 19 // DW_FORM_ref4
317
+ .b8 0 // EOM(1)
318
+ .b8 0 // EOM(2)
319
+ .b8 4 // Abbreviation Code
320
+ .b8 29 // DW_TAG_inlined_subroutine
321
+ .b8 0 // DW_CHILDREN_no
322
+ .b8 49 // DW_AT_abstract_origin
323
+ .b8 19 // DW_FORM_ref4
324
+ .b8 17 // DW_AT_low_pc
325
+ .b8 1 // DW_FORM_addr
326
+ .b8 18 // DW_AT_high_pc
327
+ .b8 1 // DW_FORM_addr
328
+ .b8 88 // DW_AT_call_file
329
+ .b8 11 // DW_FORM_data1
330
+ .b8 89 // DW_AT_call_line
331
+ .b8 11 // DW_FORM_data1
332
+ .b8 87 // DW_AT_call_column
333
+ .b8 11 // DW_FORM_data1
334
+ .b8 0 // EOM(1)
335
+ .b8 0 // EOM(2)
336
+ .b8 0 // EOM(3)
337
+ }
338
+ .section .debug_info
339
+ {
340
+ .b32 220 // Length of Unit
341
+ .b8 2 // DWARF version number
342
+ .b8 0
343
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
344
+ .b8 8 // Address Size (in bytes)
345
+ .b8 1 // Abbrev [1] 0xb:0xd5 DW_TAG_compile_unit
346
+ .b8 116 // DW_AT_producer
347
+ .b8 114
348
+ .b8 105
349
+ .b8 116
350
+ .b8 111
351
+ .b8 110
352
+ .b8 0
353
+ .b8 2 // DW_AT_language
354
+ .b8 0
355
+ .b8 99 // DW_AT_name
356
+ .b8 121
357
+ .b8 102
358
+ .b8 116
359
+ .b8 55
360
+ .b8 115
361
+ .b8 105
362
+ .b8 97
363
+ .b8 108
364
+ .b8 101
365
+ .b8 112
366
+ .b8 114
367
+ .b8 105
368
+ .b8 119
369
+ .b8 54
370
+ .b8 101
371
+ .b8 117
372
+ .b8 106
373
+ .b8 117
374
+ .b8 108
375
+ .b8 97
376
+ .b8 120
377
+ .b8 112
378
+ .b8 105
379
+ .b8 53
380
+ .b8 55
381
+ .b8 113
382
+ .b8 108
383
+ .b8 114
384
+ .b8 97
385
+ .b8 102
386
+ .b8 107
387
+ .b8 109
388
+ .b8 112
389
+ .b8 52
390
+ .b8 107
391
+ .b8 50
392
+ .b8 107
393
+ .b8 106
394
+ .b8 119
395
+ .b8 122
396
+ .b8 119
397
+ .b8 52
398
+ .b8 110
399
+ .b8 111
400
+ .b8 104
401
+ .b8 50
402
+ .b8 51
403
+ .b8 100
404
+ .b8 100
405
+ .b8 122
406
+ .b8 54
407
+ .b8 46
408
+ .b8 112
409
+ .b8 121
410
+ .b8 0
411
+ .b32 .debug_line // DW_AT_stmt_list
412
+ .b8 47 // DW_AT_comp_dir
413
+ .b8 119
414
+ .b8 111
415
+ .b8 114
416
+ .b8 107
417
+ .b8 115
418
+ .b8 112
419
+ .b8 97
420
+ .b8 99
421
+ .b8 101
422
+ .b8 47
423
+ .b8 104
424
+ .b8 97
425
+ .b8 110
426
+ .b8 114
427
+ .b8 117
428
+ .b8 105
429
+ .b8 47
430
+ .b8 83
431
+ .b8 112
432
+ .b8 101
433
+ .b8 99
434
+ .b8 70
435
+ .b8 111
436
+ .b8 114
437
+ .b8 103
438
+ .b8 101
439
+ .b8 45
440
+ .b8 101
441
+ .b8 120
442
+ .b8 116
443
+ .b8 47
444
+ .b8 99
445
+ .b8 97
446
+ .b8 99
447
+ .b8 104
448
+ .b8 101
449
+ .b8 47
450
+ .b8 99
451
+ .b8 111
452
+ .b8 109
453
+ .b8 112
454
+ .b8 105
455
+ .b8 108
456
+ .b8 101
457
+ .b8 100
458
+ .b8 95
459
+ .b8 107
460
+ .b8 101
461
+ .b8 114
462
+ .b8 110
463
+ .b8 101
464
+ .b8 108
465
+ .b8 115
466
+ .b8 47
467
+ .b8 121
468
+ .b8 102
469
+ .b8 0
470
+ .b8 2 // Abbrev [2] 0x8b:0x26 DW_TAG_subprogram
471
+ .b8 116 // DW_AT_name
472
+ .b8 114
473
+ .b8 105
474
+ .b8 116
475
+ .b8 111
476
+ .b8 110
477
+ .b8 95
478
+ .b8 112
479
+ .b8 101
480
+ .b8 114
481
+ .b8 95
482
+ .b8 102
483
+ .b8 117
484
+ .b8 115
485
+ .b8 101
486
+ .b8 100
487
+ .b8 95
488
+ .b8 95
489
+ .b8 116
490
+ .b8 111
491
+ .b8 95
492
+ .b8 99
493
+ .b8 111
494
+ .b8 112
495
+ .b8 121
496
+ .b8 95
497
+ .b8 109
498
+ .b8 117
499
+ .b8 108
500
+ .b8 95
501
+ .b8 115
502
+ .b8 117
503
+ .b8 109
504
+ .b8 95
505
+ .b8 49
506
+ .b8 0
507
+ .b8 1 // DW_AT_inline
508
+ .b8 3 // Abbrev [3] 0xb1:0x2e DW_TAG_subprogram
509
+ .b64 $L__func_begin0 // DW_AT_low_pc
510
+ .b64 $L__func_end0 // DW_AT_high_pc
511
+ .b32 139 // DW_AT_abstract_origin
512
+ .b8 4 // Abbrev [4] 0xc6:0x18 DW_TAG_inlined_subroutine
513
+ .b32 139 // DW_AT_abstract_origin
514
+ .b64 $L__tmp1 // DW_AT_low_pc
515
+ .b64 $L__tmp2 // DW_AT_high_pc
516
+ .b8 1 // DW_AT_call_file
517
+ .b8 36 // DW_AT_call_line
518
+ .b8 24 // DW_AT_call_column
519
+ .b8 0 // End Of Children Mark
520
+ .b8 0 // End Of Children Mark
521
+ }
522
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.source ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":18:0)
2
+ #loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
3
+ #loc24 = loc(unknown)
4
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
5
+ #loc31 = loc("in_ptr0"(#loc))
6
+ #loc32 = loc("out_ptr0"(#loc))
7
+ #loc33 = loc("ks0"(#loc))
8
+ #loc34 = loc("xnumel"(#loc))
9
+ #loc35 = loc("r0_numel"(#loc))
10
+ #loc54 = loc("input"(#loc22))
11
+ #loc55 = loc("a"(#loc27))
12
+ #loc56 = loc("b"(#loc27))
13
+ module {
14
+ tt.func public @triton_per_fused__to_copy_mul_sum_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
15
+ %r0_numel_0 = arith.constant 32 : i32 loc(#loc36)
16
+ %xoffset = tt.get_program_id x : i32 loc(#loc37)
17
+ %xoffset_1 = arith.constant 128 : i32 loc(#loc38)
18
+ %xoffset_2 = arith.constant 128 : i32 loc(#loc38)
19
+ %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc38)
20
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39)
21
+ %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc40)
22
+ %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<128x1xi32> loc(#loc41)
23
+ %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<128x1xi32> loc(#loc41)
24
+ %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc42)
25
+ %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<128x1xi32> loc(#loc42)
26
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc43)
27
+ %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc44)
28
+ %r0_offset = arith.constant 0 : i32 loc(#loc45)
29
+ %r0_mask = arith.constant true loc(#loc46)
30
+ %r0_mask_9 = arith.constant dense<true> : tensor<128x32xi1> loc(#loc46)
31
+ %tmp0 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc47)
32
+ %tmp0_10 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc47)
33
+ %tmp0_11 = arith.muli %tmp0_10, %tmp0 : tensor<1x32xi64> loc(#loc47)
34
+ %tmp0_12 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc48)
35
+ %tmp0_13 = tt.broadcast %tmp0_12 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc48)
36
+ %tmp0_14 = tt.broadcast %tmp0_11 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc48)
37
+ %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<128x32xi64> loc(#loc48)
38
+ %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc49)
39
+ %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc49)
40
+ %tmp0_18 = arith.constant 0.000000e+00 : f32 loc(#loc50)
41
+ %tmp0_19 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc50)
42
+ %tmp0_20 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc50)
43
+ %tmp0_21 = tt.load %tmp0_17, %tmp0_19, %tmp0_20 : tensor<128x32x!tt.ptr<f32>> loc(#loc50)
44
+ %tmp3 = arith.constant 0 : i32 loc(#loc51)
45
+ %tmp3_22 = arith.constant 0.000000e+00 : f32 loc(#loc51)
46
+ %tmp3_23 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc51)
47
+ %tmp3_24 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc51)
48
+ %tmp3_25 = arith.select %tmp3_24, %tmp0_21, %tmp3_23 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc51)
49
+ %tmp4 = tt.call @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp3_25) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc52)
50
+ %tmp4_26 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc53)
51
+ %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>> loc(#loc19)
52
+ %1 = tt.addptr %0, %xindex_6 : tensor<128x1x!tt.ptr<bf16>>, tensor<128x1xi32> loc(#loc19)
53
+ %2 = arith.truncf %tmp4_26 : tensor<128x1xf32> to tensor<128x1xbf16> loc(#loc20)
54
+ tt.store %1, %2, %xmask_7 : tensor<128x1x!tt.ptr<bf16>> loc(#loc20)
55
+ tt.return loc(#loc21)
56
+ } loc(#loc)
57
+ tt.func private @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x32xf32> loc("input"(#loc22))) -> tensor<128xf32> attributes {noinline = false} {
58
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
59
+ ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
60
+ %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc23)
61
+ tt.reduce.return %2 : f32 loc(#loc23)
62
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc23)
63
+ tt.return %0 : tensor<128xf32> loc(#loc25)
64
+ ^bb1: // no predecessors
65
+ %1 = ub.poison : tensor<128xf32> loc(#loc26)
66
+ tt.return %1 : tensor<128xf32> loc(#loc26)
67
+ } loc(#loc22)
68
+ tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc27)), %b: f32 loc("b"(#loc27))) -> f32 attributes {noinline = false} {
69
+ %0 = arith.addf %a, %b : f32 loc(#loc28)
70
+ tt.return %0 : f32 loc(#loc29)
71
+ ^bb1: // no predecessors
72
+ %1 = ub.poison : f32 loc(#loc30)
73
+ tt.return %1 : f32 loc(#loc30)
74
+ } loc(#loc27)
75
+ } loc(#loc)
76
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":19:15)
77
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":23:28)
78
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":23:33)
79
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:36)
80
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:44)
81
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:23)
82
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":25:21)
83
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":26:28)
84
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":26:38)
85
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":27:16)
86
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":28:48)
87
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:39)
88
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:35)
89
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:30)
90
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:46)
91
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":35:33)
92
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":36:24)
93
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":36:27)
94
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:25)
95
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:36)
96
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:4)
97
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
98
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
99
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
100
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
101
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
102
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
103
+ #loc36 = loc("r0_numel"(#loc1))
104
+ #loc37 = loc("xoffset"(#loc2))
105
+ #loc38 = loc("xoffset"(#loc3))
106
+ #loc39 = loc("xindex"(#loc4))
107
+ #loc40 = loc("xindex"(#loc5))
108
+ #loc41 = loc("xindex"(#loc6))
109
+ #loc42 = loc("xmask"(#loc7))
110
+ #loc43 = loc("r0_index"(#loc8))
111
+ #loc44 = loc("r0_index"(#loc9))
112
+ #loc45 = loc("r0_offset"(#loc10))
113
+ #loc46 = loc("r0_mask"(#loc11))
114
+ #loc47 = loc("tmp0"(#loc12))
115
+ #loc48 = loc("tmp0"(#loc13))
116
+ #loc49 = loc("tmp0"(#loc14))
117
+ #loc50 = loc("tmp0"(#loc15))
118
+ #loc51 = loc("tmp3"(#loc16))
119
+ #loc52 = loc("tmp4"(#loc17))
120
+ #loc53 = loc("tmp4"(#loc18))
SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ttgir ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
2
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":18:0)
3
+ #loc1 = loc(unknown)
4
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":36:24)
5
+ #loc20 = loc("in_ptr0"(#loc))
6
+ #loc21 = loc("out_ptr0"(#loc))
7
+ #loc22 = loc("ks0"(#loc))
8
+ #loc23 = loc("xnumel"(#loc))
9
+ #loc24 = loc("r0_numel"(#loc))
10
+ #loc36 = loc("tmp4"(#loc14))
11
+ #loc39 = loc(callsite(#loc1 at #loc36))
12
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
13
+ tt.func public @triton_per_fused__to_copy_mul_sum_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
14
+ %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked> loc(#loc1)
15
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
16
+ %xoffset = tt.get_program_id x : i32 loc(#loc25)
17
+ %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc26)
18
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc27)
19
+ %xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc27)
20
+ %xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc28)
21
+ %xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<128x1xi32, #blocked> loc(#loc28)
22
+ %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32, #blocked> loc(#loc29)
23
+ %xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<128x1xi32, #blocked> loc(#loc29)
24
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc30)
25
+ %r0_index_5 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc30)
26
+ %tmp0 = arith.extsi %r0_index_5 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc31)
27
+ %tmp0_6 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc31)
28
+ %tmp0_7 = arith.muli %tmp0_6, %tmp0 : tensor<1x32xi64, #blocked> loc(#loc31)
29
+ %tmp0_8 = arith.extsi %xindex_3 : tensor<128x1xi32, #blocked> to tensor<128x1xi64, #blocked> loc(#loc32)
30
+ %tmp0_9 = tt.broadcast %tmp0_8 : tensor<128x1xi64, #blocked> -> tensor<128x32xi64, #blocked> loc(#loc32)
31
+ %tmp0_10 = tt.broadcast %tmp0_7 : tensor<1x32xi64, #blocked> -> tensor<128x32xi64, #blocked> loc(#loc32)
32
+ %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<128x32xi64, #blocked> loc(#loc32)
33
+ %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>, #blocked> loc(#loc33)
34
+ %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<128x32x!tt.ptr<f32>, #blocked>, tensor<128x32xi64, #blocked> loc(#loc33)
35
+ %tmp0_14 = tt.broadcast %xmask_4 : tensor<128x1xi1, #blocked> -> tensor<128x32xi1, #blocked> loc(#loc34)
36
+ %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst : tensor<128x32x!tt.ptr<f32>, #blocked> loc(#loc34)
37
+ %tmp3 = arith.select %tmp0_14, %tmp0_15, %cst : tensor<128x32xi1, #blocked>, tensor<128x32xf32, #blocked> loc(#loc35)
38
+ %tmp4 = "tt.reduce"(%tmp3) <{axis = 1 : i32}> ({
39
+ ^bb0(%tmp4_17: f32 loc(callsite(#loc1 at #loc36)), %tmp4_18: f32 loc(callsite(#loc1 at #loc36))):
40
+ %tmp4_19 = arith.addf %tmp4_17, %tmp4_18 : f32 loc(#loc40)
41
+ tt.reduce.return %tmp4_19 : f32 loc(#loc38)
42
+ }) : (tensor<128x32xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc38)
43
+ %tmp4_16 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xf32, #blocked> loc(#loc37)
44
+ %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>, #blocked> loc(#loc17)
45
+ %1 = tt.addptr %0, %xindex_3 : tensor<128x1x!tt.ptr<bf16>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc17)
46
+ %2 = arith.truncf %tmp4_16 : tensor<128x1xf32, #blocked> to tensor<128x1xbf16, #blocked> loc(#loc18)
47
+ tt.store %1, %2, %xmask_4 : tensor<128x1x!tt.ptr<bf16>, #blocked> loc(#loc18)
48
+ tt.return loc(#loc19)
49
+ } loc(#loc)
50
+ } loc(#loc)
51
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":23:28)
52
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":23:33)
53
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:44)
54
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:23)
55
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":25:21)
56
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":26:38)
57
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:39)
58
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:35)
59
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:30)
60
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:46)
61
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":35:33)
62
+ #loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
63
+ #loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
64
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":36:27)
65
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:25)
66
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:36)
67
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:4)
68
+ #loc25 = loc("xoffset"(#loc2))
69
+ #loc26 = loc("xoffset"(#loc3))
70
+ #loc27 = loc("xindex"(#loc4))
71
+ #loc28 = loc("xindex"(#loc5))
72
+ #loc29 = loc("xmask"(#loc6))
73
+ #loc30 = loc("r0_index"(#loc7))
74
+ #loc31 = loc("tmp0"(#loc8))
75
+ #loc32 = loc("tmp0"(#loc9))
76
+ #loc33 = loc("tmp0"(#loc10))
77
+ #loc34 = loc("tmp0"(#loc11))
78
+ #loc35 = loc("tmp3"(#loc12))
79
+ #loc37 = loc("tmp4"(#loc16))
80
+ #loc38 = loc(callsite(#loc13 at #loc36))
81
+ #loc40 = loc(callsite(#loc15 at #loc38))
SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ttir ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":36:24)
4
+ #loc22 = loc("in_ptr0"(#loc))
5
+ #loc23 = loc("out_ptr0"(#loc))
6
+ #loc24 = loc("ks0"(#loc))
7
+ #loc25 = loc("xnumel"(#loc))
8
+ #loc26 = loc("r0_numel"(#loc))
9
+ #loc40 = loc("tmp4"(#loc16))
10
+ #loc43 = loc(callsite(#loc1 at #loc40))
11
+ module {
12
+ tt.func public @triton_per_fused__to_copy_mul_sum_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
13
+ %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc1)
14
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
15
+ %xoffset = tt.get_program_id x : i32 loc(#loc27)
16
+ %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc28)
17
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc29)
18
+ %xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc30)
19
+ %xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<128x1xi32> loc(#loc31)
20
+ %xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<128x1xi32> loc(#loc31)
21
+ %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc32)
22
+ %xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<128x1xi32> loc(#loc32)
23
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc33)
24
+ %r0_index_5 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc34)
25
+ %tmp0 = arith.extsi %r0_index_5 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc35)
26
+ %tmp0_6 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc35)
27
+ %tmp0_7 = arith.muli %tmp0_6, %tmp0 : tensor<1x32xi64> loc(#loc35)
28
+ %tmp0_8 = arith.extsi %xindex_3 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc36)
29
+ %tmp0_9 = tt.broadcast %tmp0_8 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc36)
30
+ %tmp0_10 = tt.broadcast %tmp0_7 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc36)
31
+ %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<128x32xi64> loc(#loc36)
32
+ %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc37)
33
+ %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc37)
34
+ %tmp0_14 = tt.broadcast %xmask_4 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc38)
35
+ %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst : tensor<128x32x!tt.ptr<f32>> loc(#loc38)
36
+ %tmp3 = arith.select %tmp0_14, %tmp0_15, %cst : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc39)
37
+ %tmp4 = "tt.reduce"(%tmp3) <{axis = 1 : i32}> ({
38
+ ^bb0(%tmp4_17: f32 loc(callsite(#loc1 at #loc40)), %tmp4_18: f32 loc(callsite(#loc1 at #loc40))):
39
+ %tmp4_19 = arith.addf %tmp4_17, %tmp4_18 : f32 loc(#loc44)
40
+ tt.reduce.return %tmp4_19 : f32 loc(#loc42)
41
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc42)
42
+ %tmp4_16 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc41)
43
+ %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>> loc(#loc19)
44
+ %1 = tt.addptr %0, %xindex_3 : tensor<128x1x!tt.ptr<bf16>>, tensor<128x1xi32> loc(#loc19)
45
+ %2 = arith.truncf %tmp4_16 : tensor<128x1xf32> to tensor<128x1xbf16> loc(#loc20)
46
+ tt.store %1, %2, %xmask_4 : tensor<128x1x!tt.ptr<bf16>> loc(#loc20)
47
+ tt.return loc(#loc21)
48
+ } loc(#loc)
49
+ } loc(#loc)
50
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":23:28)
51
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":23:33)
52
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:36)
53
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:44)
54
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":24:23)
55
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":25:21)
56
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":26:28)
57
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":26:38)
58
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:39)
59
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:35)
60
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:30)
61
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":33:46)
62
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":35:33)
63
+ #loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
64
+ #loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
65
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":36:27)
66
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:25)
67
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:36)
68
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/yf/cyft7sialepriw6eujulaxpi57qlrafkmp4k2kjwzw4noh23ddz6.py":37:4)
69
+ #loc27 = loc("xoffset"(#loc2))
70
+ #loc28 = loc("xoffset"(#loc3))
71
+ #loc29 = loc("xindex"(#loc4))
72
+ #loc30 = loc("xindex"(#loc5))
73
+ #loc31 = loc("xindex"(#loc6))
74
+ #loc32 = loc("xmask"(#loc7))
75
+ #loc33 = loc("r0_index"(#loc8))
76
+ #loc34 = loc("r0_index"(#loc9))
77
+ #loc35 = loc("tmp0"(#loc10))
78
+ #loc36 = loc("tmp0"(#loc11))
79
+ #loc37 = loc("tmp0"(#loc12))
80
+ #loc38 = loc("tmp0"(#loc13))
81
+ #loc39 = loc("tmp3"(#loc14))
82
+ #loc41 = loc("tmp4"(#loc18))
83
+ #loc42 = loc(callsite(#loc15 at #loc40))
84
+ #loc44 = loc(callsite(#loc17 at #loc42))
SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}}
SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin ADDED
Binary file (31.5 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0)
2
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33)
3
+ #loc3 = loc(unknown)
4
+ #loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28)
5
+ #loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58)
6
+ #loc59 = loc("in_ptr0"(#loc))
7
+ #loc60 = loc("out_ptr2"(#loc))
8
+ #loc61 = loc("xnumel"(#loc))
9
+ #loc62 = loc("r0_numel"(#loc))
10
+ #loc90 = loc("out_max"(#loc35))
11
+ #loc96 = loc("out_sum"(#loc42))
12
+ #loc123 = loc(callsite(#loc90 at #loc2))
13
+ #loc129 = loc(callsite(#loc96 at #loc2))
14
+ #loc138 = loc(callsite(#loc3 at #loc123))
15
+ #loc141 = loc(callsite(#loc3 at #loc129))
16
+ module {
17
+ tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
18
+ %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc108)
19
+ %cst = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc3)
20
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc3)
21
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc3)
22
+ %c2048_i32 = arith.constant 2048 : i32 loc(#loc3)
23
+ %c32000_i32 = arith.constant 32000 : i32 loc(#loc3)
24
+ %c0_i32 = arith.constant 0 : i32 loc(#loc3)
25
+ %cst_2 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc3)
26
+ %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc3)
27
+ %xoffset = tt.get_program_id x : i32 loc(#loc64)
28
+ %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc65)
29
+ %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc66)
30
+ %_tmp3_sum:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp3_max = %cst_3, %_tmp3_sum_12 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 {
31
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc68)
32
+ %r0_index_13 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc68)
33
+ %r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x2048xi32> loc(#loc69)
34
+ %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc70)
35
+ %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc110)
36
+ %tmp0_15 = arith.addi %r0_index_13, %tmp0_14 : tensor<1x2048xi32> loc(#loc71)
37
+ %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc72)
38
+ %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc72)
39
+ %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc73)
40
+ %tmp0_19 = arith.extf %tmp0_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc74)
41
+ %mask = arith.cmpf ogt, %_tmp3_max, %tmp0_19 : tensor<1x2048xf32> loc(#loc131)
42
+ %mask_20 = arith.cmpf une, %_tmp3_max, %_tmp3_max : tensor<1x2048xf32> loc(#loc132)
43
+ %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1> loc(#loc133)
44
+ %out_max_22 = arith.select %mask_21, %_tmp3_max, %tmp0_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc134)
45
+ %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_3 : tensor<1x2048xf32> loc(#loc114)
46
+ %lhs_scale_23 = arith.subf %_tmp3_max, %out_max_22 : tensor<1x2048xf32> loc(#loc115)
47
+ %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc135)
48
+ %lhs_scale_25 = arith.select %lhs_scale, %cst, %lhs_scale_24 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc117)
49
+ %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x2048xf32> loc(#loc118)
50
+ %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc136)
51
+ %rhs_scale_27 = arith.select %lhs_scale, %cst, %rhs_scale_26 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc120)
52
+ %out_sum_28 = arith.mulf %_tmp3_sum_12, %lhs_scale_25 : tensor<1x2048xf32> loc(#loc121)
53
+ %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x2048xf32> loc(#loc122)
54
+ %_tmp3_max_30 = arith.select %r0_mask, %out_max_22, %_tmp3_max : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc88)
55
+ %_tmp3_sum_31 = arith.select %r0_mask, %out_sum_29, %_tmp3_sum_12 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc89)
56
+ scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc33)
57
+ } loc(#loc109)
58
+ %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({
59
+ ^bb0(%out_max_12: f32 loc(callsite(#loc3 at #loc123)), %out_max_13: f32 loc(callsite(#loc3 at #loc123))):
60
+ %mask = arith.cmpf ogt, %out_max_12, %out_max_13 : f32 loc(#loc142)
61
+ %mask_14 = arith.cmpf une, %out_max_12, %out_max_12 : f32 loc(#loc143)
62
+ %mask_15 = arith.ori %mask, %mask_14 : i1 loc(#loc144)
63
+ %out_max_16 = arith.select %mask_15, %out_max_12, %out_max_13 : f32 loc(#loc145)
64
+ tt.reduce.return %out_max_16 : f32 loc(#loc137)
65
+ }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc137)
66
+ %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc124)
67
+ %delta_5 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc108)
68
+ %delta_6 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc125)
69
+ %delta_7 = arith.subf %_tmp3_sum#0, %delta_6 : tensor<1x2048xf32> loc(#loc125)
70
+ %delta_8 = tt.broadcast %delta_5 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc126)
71
+ %delta_9 = arith.select %delta_8, %cst_0, %delta_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc126)
72
+ %out_sum = tt.extern_elementwise %delta_9 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc139)
73
+ %out_sum_10 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32> loc(#loc128)
74
+ %out_sum_11 = "tt.reduce"(%out_sum_10) <{axis = 1 : i32}> ({
75
+ ^bb0(%out_sum_12: f32 loc(callsite(#loc3 at #loc129)), %out_sum_13: f32 loc(callsite(#loc3 at #loc129))):
76
+ %out_sum_14 = arith.addf %out_sum_12, %out_sum_13 : f32 loc(#loc146)
77
+ tt.reduce.return %out_sum_14 : f32 loc(#loc140)
78
+ }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc140)
79
+ %tmp4 = tt.expand_dims %out_sum_11 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97)
80
+ scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 {
81
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc98)
82
+ %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc98)
83
+ %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x2048xi32> loc(#loc99)
84
+ %tmp5 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc100)
85
+ %tmp5_13 = tt.splat %tmp5 : i32 -> tensor<1x2048xi32> loc(#loc130)
86
+ %tmp5_14 = arith.addi %r0_index_12, %tmp5_13 : tensor<1x2048xi32> loc(#loc101)
87
+ %tmp5_15 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc102)
88
+ %tmp5_16 = tt.addptr %tmp5_15, %tmp5_14 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc102)
89
+ %tmp5_17 = tt.load %tmp5_16, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc103)
90
+ %tmp5_18 = arith.extf %tmp5_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104)
91
+ %tmp7 = arith.subf %tmp5_18, %delta_6 : tensor<1x2048xf32> loc(#loc105)
92
+ %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc106)
93
+ %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc107)
94
+ %tmp9_19 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc107)
95
+ %0 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<1x2048x!tt.ptr<f32>> loc(#loc56)
96
+ %1 = tt.addptr %0, %tmp5_14 : tensor<1x2048x!tt.ptr<f32>>, tensor<1x2048xi32> loc(#loc56)
97
+ tt.store %1, %tmp9_19, %r0_mask : tensor<1x2048x!tt.ptr<f32>> loc(#loc57)
98
+ } loc(#loc45)
99
+ tt.return loc(#loc58)
100
+ } loc(#loc)
101
+ } loc(#loc)
102
+ #loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40)
103
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28)
104
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:27)
105
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37)
106
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":31:40)
107
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":32:31)
108
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29)
109
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47)
110
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41)
111
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34)
112
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52)
113
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105)
114
+ #loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
115
+ #loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31)
116
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40)
117
+ #loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
118
+ #loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
119
+ #loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
120
+ #loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19)
121
+ #loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53)
122
+ #loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29)
123
+ #loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62)
124
+ #loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39)
125
+ #loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53)
126
+ #loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62)
127
+ #loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39)
128
+ #loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24)
129
+ #loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36)
130
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54)
131
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54)
132
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:8)
133
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
134
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46)
135
+ #loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68)
136
+ #loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58)
137
+ #loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42)
138
+ #loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31)
139
+ #loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
140
+ #loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
141
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16)
142
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:40)
143
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":53:31)
144
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":54:29)
145
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:47)
146
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:41)
147
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:34)
148
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52)
149
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106)
150
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22)
151
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29)
152
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23)
153
+ #loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29)
154
+ #loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53)
155
+ #loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4)
156
+ #loc63 = loc("delta"(#loc1))
157
+ #loc64 = loc("xoffset"(#loc4))
158
+ #loc65 = loc("r0_base"(#loc5))
159
+ #loc66 = loc("r0_base"(#loc6))
160
+ #loc67 = loc("_tmp3_max"(#loc7))
161
+ #loc68 = loc("r0_index"(#loc8))
162
+ #loc69 = loc("r0_mask"(#loc9))
163
+ #loc70 = loc("tmp0"(#loc10))
164
+ #loc71 = loc("tmp0"(#loc11))
165
+ #loc72 = loc("tmp0"(#loc12))
166
+ #loc73 = loc("tmp0"(#loc13))
167
+ #loc74 = loc("tmp0"(#loc14))
168
+ #loc75 = loc("mask"(#loc15))
169
+ #loc76 = loc("out_max"(#loc16))
170
+ #loc77 = loc("mask"(#loc18))
171
+ #loc78 = loc("mask"(#loc19))
172
+ #loc79 = loc("lhs_scale"(#loc21))
173
+ #loc80 = loc("lhs_scale"(#loc22))
174
+ #loc81 = loc("lhs_scale"(#loc24))
175
+ #loc82 = loc("lhs_scale"(#loc25))
176
+ #loc83 = loc("rhs_scale"(#loc26))
177
+ #loc84 = loc("rhs_scale"(#loc27))
178
+ #loc85 = loc("rhs_scale"(#loc28))
179
+ #loc86 = loc("out_sum"(#loc29))
180
+ #loc87 = loc("out_sum"(#loc30))
181
+ #loc88 = loc("_tmp3_max"(#loc31))
182
+ #loc89 = loc("_tmp3_sum"(#loc32))
183
+ #loc91 = loc("out_max_keepdim"(#loc36))
184
+ #loc92 = loc("delta"(#loc37))
185
+ #loc93 = loc("delta"(#loc38))
186
+ #loc94 = loc("out_sum"(#loc39))
187
+ #loc95 = loc("out_sum"(#loc40))
188
+ #loc97 = loc("tmp4"(#loc44))
189
+ #loc98 = loc("r0_index"(#loc46))
190
+ #loc99 = loc("r0_mask"(#loc47))
191
+ #loc100 = loc("tmp5"(#loc48))
192
+ #loc101 = loc("tmp5"(#loc49))
193
+ #loc102 = loc("tmp5"(#loc50))
194
+ #loc103 = loc("tmp5"(#loc51))
195
+ #loc104 = loc("tmp5"(#loc52))
196
+ #loc105 = loc("tmp7"(#loc53))
197
+ #loc106 = loc("tmp8"(#loc54))
198
+ #loc107 = loc("tmp9"(#loc55))
199
+ #loc108 = loc(callsite(#loc63 at #loc2))
200
+ #loc109 = loc("_tmp3_sum"(#loc67))
201
+ #loc110 = loc(fused[#loc71, #loc70])
202
+ #loc111 = loc("mask"(#loc75))
203
+ #loc112 = loc(callsite(#loc76 at #loc17))
204
+ #loc113 = loc("mask"(#loc78))
205
+ #loc114 = loc(callsite(#loc79 at #loc17))
206
+ #loc115 = loc(callsite(#loc80 at #loc17))
207
+ #loc116 = loc(callsite(#loc81 at #loc17))
208
+ #loc117 = loc(callsite(#loc82 at #loc17))
209
+ #loc118 = loc(callsite(#loc83 at #loc17))
210
+ #loc119 = loc(callsite(#loc84 at #loc17))
211
+ #loc120 = loc(callsite(#loc85 at #loc17))
212
+ #loc121 = loc(callsite(#loc86 at #loc17))
213
+ #loc122 = loc(callsite(#loc87 at #loc17))
214
+ #loc124 = loc(callsite(#loc91 at #loc2))
215
+ #loc125 = loc(callsite(#loc92 at #loc2))
216
+ #loc126 = loc(callsite(#loc93 at #loc2))
217
+ #loc127 = loc(callsite(#loc94 at #loc2))
218
+ #loc128 = loc(callsite(#loc95 at #loc2))
219
+ #loc130 = loc(fused[#loc101, #loc100])
220
+ #loc131 = loc(callsite(#loc111 at #loc112))
221
+ #loc132 = loc(callsite(#loc77 at #loc112))
222
+ #loc133 = loc(callsite(#loc113 at #loc112))
223
+ #loc134 = loc(callsite(#loc20 at #loc112))
224
+ #loc135 = loc(callsite(#loc23 at #loc116))
225
+ #loc136 = loc(callsite(#loc23 at #loc119))
226
+ #loc137 = loc(callsite(#loc34 at #loc123))
227
+ #loc139 = loc(callsite(#loc23 at #loc127))
228
+ #loc140 = loc(callsite(#loc41 at #loc129))
229
+ #loc142 = loc(callsite(#loc111 at #loc137))
230
+ #loc143 = loc(callsite(#loc77 at #loc137))
231
+ #loc144 = loc(callsite(#loc113 at #loc137))
232
+ #loc145 = loc(callsite(#loc20 at #loc137))
233
+ #loc146 = loc(callsite(#loc43 at #loc140))
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json"}}
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin ADDED
Binary file (95 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "2d88f34765cb5a3f781885efc3427f98913e07cbedc15d0af342925a3b9998ab", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0"}
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir ADDED
@@ -0,0 +1,808 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @assertFunc_2 = internal constant [8 x i8] c"unknown\00"
6
+ @assertFile_2 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py\00"
7
+ @assertMessage_2 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp36 < ks3\00"
8
+ @assertFunc_1 = internal constant [8 x i8] c"unknown\00"
9
+ @assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py\00"
10
+ @assertMessage_1 = internal constant [65 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2\00"
11
+ @assertFunc_0 = internal constant [8 x i8] c"unknown\00"
12
+ @assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py\00"
13
+ @assertMessage_0 = internal constant [64 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2\00"
14
+
15
+ ; Function Attrs: noreturn
16
+ declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0
17
+
18
+ define ptx_kernel void @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, ptr addrspace(1) readnone captures(none) %10, ptr addrspace(1) readnone captures(none) %11) local_unnamed_addr #1 !dbg !9 {
19
+ %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10
20
+ %14 = shl i32 %13, 10, !dbg !11
21
+ %15 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12
22
+ %16 = shl nuw nsw i32 %15, 3, !dbg !12
23
+ %17 = and i32 %16, 1016, !dbg !12
24
+ %18 = insertelement <8 x i32> poison, i32 %9, i64 0, !dbg !13
25
+ %19 = shufflevector <8 x i32> %18, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !13
26
+ %20 = or disjoint i32 %17, %14, !dbg !14
27
+ %21 = or disjoint i32 %20, 1, !dbg !14
28
+ %22 = or disjoint i32 %20, 2, !dbg !14
29
+ %23 = or disjoint i32 %20, 3, !dbg !14
30
+ %24 = or disjoint i32 %20, 4, !dbg !14
31
+ %25 = or disjoint i32 %20, 5, !dbg !14
32
+ %26 = or disjoint i32 %20, 6, !dbg !14
33
+ %27 = or disjoint i32 %20, 7, !dbg !14
34
+ %28 = insertelement <8 x i32> poison, i32 %20, i64 0, !dbg !13
35
+ %29 = insertelement <8 x i32> %28, i32 %21, i64 1, !dbg !13
36
+ %30 = insertelement <8 x i32> %29, i32 %22, i64 2, !dbg !13
37
+ %31 = insertelement <8 x i32> %30, i32 %23, i64 3, !dbg !13
38
+ %32 = insertelement <8 x i32> %31, i32 %24, i64 4, !dbg !13
39
+ %33 = insertelement <8 x i32> %32, i32 %25, i64 5, !dbg !13
40
+ %34 = insertelement <8 x i32> %33, i32 %26, i64 6, !dbg !13
41
+ %35 = insertelement <8 x i32> %34, i32 %27, i64 7, !dbg !13
42
+ %36 = icmp slt <8 x i32> %35, %19, !dbg !13
43
+ %37 = sext <8 x i32> %35 to <8 x i64>, !dbg !15
44
+ %38 = insertelement <8 x i64> poison, i64 %5, i64 0, !dbg !15
45
+ %39 = shufflevector <8 x i64> %38, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !15
46
+ %40 = srem <8 x i64> %37, %39, !dbg !15
47
+ %41 = extractelement <8 x i64> %37, i64 0, !dbg !16
48
+ %42 = sdiv i64 %41, %5, !dbg !17
49
+ %43 = extractelement <8 x i64> %37, i64 1, !dbg !16
50
+ %44 = sdiv i64 %43, %5, !dbg !17
51
+ %45 = extractelement <8 x i64> %37, i64 2, !dbg !16
52
+ %46 = sdiv i64 %45, %5, !dbg !17
53
+ %47 = extractelement <8 x i64> %37, i64 3, !dbg !16
54
+ %48 = sdiv i64 %47, %5, !dbg !17
55
+ %49 = extractelement <8 x i64> %37, i64 4, !dbg !16
56
+ %50 = sdiv i64 %49, %5, !dbg !17
57
+ %51 = extractelement <8 x i64> %37, i64 5, !dbg !16
58
+ %52 = sdiv i64 %51, %5, !dbg !17
59
+ %53 = extractelement <8 x i64> %37, i64 6, !dbg !16
60
+ %54 = sdiv i64 %53, %5, !dbg !17
61
+ %55 = extractelement <8 x i64> %37, i64 7, !dbg !16
62
+ %56 = sdiv i64 %55, %5, !dbg !17
63
+ %57 = srem i64 %42, %6, !dbg !18
64
+ %58 = srem i64 %44, %6, !dbg !18
65
+ %59 = srem i64 %46, %6, !dbg !18
66
+ %60 = srem i64 %48, %6, !dbg !18
67
+ %61 = srem i64 %50, %6, !dbg !18
68
+ %62 = srem i64 %52, %6, !dbg !18
69
+ %63 = srem i64 %54, %6, !dbg !18
70
+ %64 = srem i64 %56, %6, !dbg !18
71
+ %65 = getelementptr bfloat, ptr addrspace(1) %0, i64 %41, !dbg !16
72
+ %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %43, !dbg !16
73
+ %67 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !16
74
+ %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %47, !dbg !16
75
+ %69 = getelementptr bfloat, ptr addrspace(1) %0, i64 %49, !dbg !16
76
+ %70 = getelementptr bfloat, ptr addrspace(1) %0, i64 %51, !dbg !16
77
+ %71 = getelementptr bfloat, ptr addrspace(1) %0, i64 %53, !dbg !16
78
+ %72 = getelementptr bfloat, ptr addrspace(1) %0, i64 %55, !dbg !16
79
+ %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
80
+ %74 = extractelement <8 x i1> %36, i64 0, !dbg !19
81
+ %75 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %65, i64 %73, i1 %74) #4, !dbg !19
82
+ %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
83
+ %77 = extractelement <8 x i1> %36, i64 1, !dbg !19
84
+ %78 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %66, i64 %76, i1 %77) #4, !dbg !19
85
+ %79 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
86
+ %80 = extractelement <8 x i1> %36, i64 2, !dbg !19
87
+ %81 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %67, i64 %79, i1 %80) #4, !dbg !19
88
+ %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
89
+ %83 = extractelement <8 x i1> %36, i64 3, !dbg !19
90
+ %84 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %68, i64 %82, i1 %83) #4, !dbg !19
91
+ %85 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
92
+ %86 = extractelement <8 x i1> %36, i64 4, !dbg !19
93
+ %87 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %69, i64 %85, i1 %86) #4, !dbg !19
94
+ %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
95
+ %89 = extractelement <8 x i1> %36, i64 5, !dbg !19
96
+ %90 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %70, i64 %88, i1 %89) #4, !dbg !19
97
+ %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
98
+ %92 = extractelement <8 x i1> %36, i64 6, !dbg !19
99
+ %93 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %71, i64 %91, i1 %92) #4, !dbg !19
100
+ %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19
101
+ %95 = extractelement <8 x i1> %36, i64 7, !dbg !19
102
+ %96 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %72, i64 %94, i1 %95) #4, !dbg !19
103
+ %97 = getelementptr i64, ptr addrspace(1) %1, i64 %57, !dbg !20
104
+ %98 = getelementptr i64, ptr addrspace(1) %1, i64 %58, !dbg !20
105
+ %99 = getelementptr i64, ptr addrspace(1) %1, i64 %59, !dbg !20
106
+ %100 = getelementptr i64, ptr addrspace(1) %1, i64 %60, !dbg !20
107
+ %101 = getelementptr i64, ptr addrspace(1) %1, i64 %61, !dbg !20
108
+ %102 = getelementptr i64, ptr addrspace(1) %1, i64 %62, !dbg !20
109
+ %103 = getelementptr i64, ptr addrspace(1) %1, i64 %63, !dbg !20
110
+ %104 = getelementptr i64, ptr addrspace(1) %1, i64 %64, !dbg !20
111
+ %105 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
112
+ %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %97, i64 %105, i1 %74) #4, !dbg !21
113
+ %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
114
+ %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %98, i64 %107, i1 %77) #4, !dbg !21
115
+ %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
116
+ %110 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %99, i64 %109, i1 %80) #4, !dbg !21
117
+ %111 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
118
+ %112 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %100, i64 %111, i1 %83) #4, !dbg !21
119
+ %113 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
120
+ %114 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %101, i64 %113, i1 %86) #4, !dbg !21
121
+ %115 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
122
+ %116 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %102, i64 %115, i1 %89) #4, !dbg !21
123
+ %117 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
124
+ %118 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %103, i64 %117, i1 %92) #4, !dbg !21
125
+ %119 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21
126
+ %120 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %119, i1 %95) #4, !dbg !21
127
+ %121 = sdiv i64 %5, 2, !dbg !22
128
+ %122 = sub nsw i64 %41, %121, !dbg !23
129
+ %123 = sub nsw i64 %43, %121, !dbg !23
130
+ %124 = sub nsw i64 %45, %121, !dbg !23
131
+ %125 = sub nsw i64 %47, %121, !dbg !23
132
+ %126 = sub nsw i64 %49, %121, !dbg !23
133
+ %127 = sub nsw i64 %51, %121, !dbg !23
134
+ %128 = sub nsw i64 %53, %121, !dbg !23
135
+ %129 = sub nsw i64 %55, %121, !dbg !23
136
+ %130 = getelementptr bfloat, ptr addrspace(1) %0, i64 %122, !dbg !24
137
+ %131 = getelementptr bfloat, ptr addrspace(1) %0, i64 %123, !dbg !24
138
+ %132 = getelementptr bfloat, ptr addrspace(1) %0, i64 %124, !dbg !24
139
+ %133 = getelementptr bfloat, ptr addrspace(1) %0, i64 %125, !dbg !24
140
+ %134 = getelementptr bfloat, ptr addrspace(1) %0, i64 %126, !dbg !24
141
+ %135 = getelementptr bfloat, ptr addrspace(1) %0, i64 %127, !dbg !24
142
+ %136 = getelementptr bfloat, ptr addrspace(1) %0, i64 %128, !dbg !24
143
+ %137 = getelementptr bfloat, ptr addrspace(1) %0, i64 %129, !dbg !24
144
+ %138 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
145
+ %139 = insertelement <8 x i64> poison, i64 %121, i64 0, !dbg !26
146
+ %140 = shufflevector <8 x i64> %139, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !26
147
+ %141 = icmp sge <8 x i64> %40, %140, !dbg !26
148
+ %142 = and <8 x i1> %36, %141, !dbg !27
149
+ %143 = extractelement <8 x i1> %142, i64 0, !dbg !28
150
+ %144 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %130, i64 %138, i1 %143) #4, !dbg !25
151
+ %145 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
152
+ %146 = extractelement <8 x i1> %142, i64 1, !dbg !28
153
+ %147 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %131, i64 %145, i1 %146) #4, !dbg !25
154
+ %148 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
155
+ %149 = extractelement <8 x i1> %142, i64 2, !dbg !28
156
+ %150 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %132, i64 %148, i1 %149) #4, !dbg !25
157
+ %151 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
158
+ %152 = extractelement <8 x i1> %142, i64 3, !dbg !28
159
+ %153 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %133, i64 %151, i1 %152) #4, !dbg !25
160
+ %154 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
161
+ %155 = extractelement <8 x i1> %142, i64 4, !dbg !28
162
+ %156 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %134, i64 %154, i1 %155) #4, !dbg !25
163
+ %157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
164
+ %158 = extractelement <8 x i1> %142, i64 5, !dbg !28
165
+ %159 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %135, i64 %157, i1 %158) #4, !dbg !25
166
+ %160 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
167
+ %161 = extractelement <8 x i1> %142, i64 6, !dbg !28
168
+ %162 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %136, i64 %160, i1 %161) #4, !dbg !25
169
+ %163 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25
170
+ %164 = extractelement <8 x i1> %142, i64 7, !dbg !28
171
+ %165 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %137, i64 %163, i1 %164) #4, !dbg !25
172
+ %166 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
173
+ %167 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %97, i64 %166, i1 %143) #4, !dbg !28
174
+ %168 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
175
+ %169 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %98, i64 %168, i1 %146) #4, !dbg !28
176
+ %170 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
177
+ %171 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %99, i64 %170, i1 %149) #4, !dbg !28
178
+ %172 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
179
+ %173 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %100, i64 %172, i1 %152) #4, !dbg !28
180
+ %174 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
181
+ %175 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %101, i64 %174, i1 %155) #4, !dbg !28
182
+ %176 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
183
+ %177 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %102, i64 %176, i1 %158) #4, !dbg !28
184
+ %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
185
+ %179 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %103, i64 %178, i1 %161) #4, !dbg !28
186
+ %180 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28
187
+ %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %180, i1 %164) #4, !dbg !28
188
+ %182 = insertelement <8 x i64> poison, i64 %167, i64 0, !dbg !29
189
+ %183 = insertelement <8 x i64> %182, i64 %169, i64 1, !dbg !29
190
+ %184 = insertelement <8 x i64> %183, i64 %171, i64 2, !dbg !29
191
+ %185 = insertelement <8 x i64> %184, i64 %173, i64 3, !dbg !29
192
+ %186 = insertelement <8 x i64> %185, i64 %175, i64 4, !dbg !29
193
+ %187 = insertelement <8 x i64> %186, i64 %177, i64 5, !dbg !29
194
+ %188 = insertelement <8 x i64> %187, i64 %179, i64 6, !dbg !29
195
+ %189 = insertelement <8 x i64> %188, i64 %181, i64 7, !dbg !29
196
+ %190 = icmp slt <8 x i64> %189, zeroinitializer, !dbg !29
197
+ %191 = insertelement <8 x i64> poison, i64 %7, i64 0, !dbg !30
198
+ %192 = shufflevector <8 x i64> %191, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !30
199
+ %193 = select <8 x i1> %190, <8 x i64> %192, <8 x i64> zeroinitializer, !dbg !30
200
+ %194 = add <8 x i64> %193, %189, !dbg !30
201
+ %195 = icmp slt <8 x i64> %194, zeroinitializer, !dbg !31
202
+ %196 = icmp sge <8 x i64> %194, %192, !dbg !32
203
+ %197 = or <8 x i1> %195, %196, !dbg !33
204
+ %198 = and <8 x i1> %142, %197, !dbg !34
205
+ %199 = bitcast <8 x i1> %198 to i8, !dbg !35
206
+ %.not = icmp eq i8 %199, 0, !dbg !35
207
+ br i1 %.not, label %201, label %200, !dbg !35
208
+
209
+ 200: ; preds = %12
210
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 36, ptr nonnull @assertFunc_0, i64 1), !dbg !35
211
+ unreachable, !dbg !35
212
+
213
+ 201: ; preds = %12
214
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !35
215
+ %202 = extractelement <8 x i64> %40, i64 0, !dbg !36
216
+ %203 = sub nsw i64 %202, %121, !dbg !36
217
+ %204 = extractelement <8 x i64> %40, i64 1, !dbg !36
218
+ %205 = sub nsw i64 %204, %121, !dbg !36
219
+ %206 = extractelement <8 x i64> %40, i64 2, !dbg !36
220
+ %207 = sub nsw i64 %206, %121, !dbg !36
221
+ %208 = extractelement <8 x i64> %40, i64 3, !dbg !36
222
+ %209 = sub nsw i64 %208, %121, !dbg !36
223
+ %210 = extractelement <8 x i64> %40, i64 4, !dbg !36
224
+ %211 = sub nsw i64 %210, %121, !dbg !36
225
+ %212 = extractelement <8 x i64> %40, i64 5, !dbg !36
226
+ %213 = sub nsw i64 %212, %121, !dbg !36
227
+ %214 = extractelement <8 x i64> %40, i64 6, !dbg !36
228
+ %215 = sub nsw i64 %214, %121, !dbg !36
229
+ %216 = extractelement <8 x i64> %40, i64 7, !dbg !36
230
+ %217 = sub nsw i64 %216, %121, !dbg !36
231
+ %218 = extractelement <8 x i64> %194, i64 0, !dbg !37
232
+ %219 = mul i64 %218, %5, !dbg !37
233
+ %220 = extractelement <8 x i64> %194, i64 1, !dbg !37
234
+ %221 = mul i64 %220, %5, !dbg !37
235
+ %222 = extractelement <8 x i64> %194, i64 2, !dbg !37
236
+ %223 = mul i64 %222, %5, !dbg !37
237
+ %224 = extractelement <8 x i64> %194, i64 3, !dbg !37
238
+ %225 = mul i64 %224, %5, !dbg !37
239
+ %226 = extractelement <8 x i64> %194, i64 4, !dbg !37
240
+ %227 = mul i64 %226, %5, !dbg !37
241
+ %228 = extractelement <8 x i64> %194, i64 5, !dbg !37
242
+ %229 = mul i64 %228, %5, !dbg !37
243
+ %230 = extractelement <8 x i64> %194, i64 6, !dbg !37
244
+ %231 = mul i64 %230, %5, !dbg !37
245
+ %232 = extractelement <8 x i64> %194, i64 7, !dbg !37
246
+ %233 = mul i64 %232, %5, !dbg !37
247
+ %234 = getelementptr bfloat, ptr addrspace(1) %2, i64 %203, !dbg !38
248
+ %235 = getelementptr bfloat, ptr addrspace(1) %234, i64 %219, !dbg !38
249
+ %236 = getelementptr bfloat, ptr addrspace(1) %2, i64 %205, !dbg !38
250
+ %237 = getelementptr bfloat, ptr addrspace(1) %236, i64 %221, !dbg !38
251
+ %238 = getelementptr bfloat, ptr addrspace(1) %2, i64 %207, !dbg !38
252
+ %239 = getelementptr bfloat, ptr addrspace(1) %238, i64 %223, !dbg !38
253
+ %240 = getelementptr bfloat, ptr addrspace(1) %2, i64 %209, !dbg !38
254
+ %241 = getelementptr bfloat, ptr addrspace(1) %240, i64 %225, !dbg !38
255
+ %242 = getelementptr bfloat, ptr addrspace(1) %2, i64 %211, !dbg !38
256
+ %243 = getelementptr bfloat, ptr addrspace(1) %242, i64 %227, !dbg !38
257
+ %244 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !38
258
+ %245 = getelementptr bfloat, ptr addrspace(1) %244, i64 %229, !dbg !38
259
+ %246 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !38
260
+ %247 = getelementptr bfloat, ptr addrspace(1) %246, i64 %231, !dbg !38
261
+ %248 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !38
262
+ %249 = getelementptr bfloat, ptr addrspace(1) %248, i64 %233, !dbg !38
263
+ %250 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
264
+ %251 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %250, i1 %143) #4, !dbg !39
265
+ %252 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
266
+ %253 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %237, i64 %252, i1 %146) #4, !dbg !39
267
+ %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
268
+ %255 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %239, i64 %254, i1 %149) #4, !dbg !39
269
+ %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
270
+ %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %241, i64 %256, i1 %152) #4, !dbg !39
271
+ %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
272
+ %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %243, i64 %258, i1 %155) #4, !dbg !39
273
+ %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
274
+ %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %245, i64 %260, i1 %158) #4, !dbg !39
275
+ %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
276
+ %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %262, i1 %161) #4, !dbg !39
277
+ %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39
278
+ %265 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %264, i1 %164) #4, !dbg !39
279
+ %266 = icmp slt <8 x i64> %40, %140, !dbg !40
280
+ %267 = add i64 %5, %41, !dbg !41
281
+ %268 = add i64 %5, %43, !dbg !41
282
+ %269 = add i64 %5, %45, !dbg !41
283
+ %270 = add i64 %5, %47, !dbg !41
284
+ %271 = add i64 %5, %49, !dbg !41
285
+ %272 = add i64 %5, %51, !dbg !41
286
+ %273 = add i64 %5, %53, !dbg !41
287
+ %274 = add i64 %5, %55, !dbg !41
288
+ %275 = sub i64 %267, %121, !dbg !42
289
+ %276 = sub i64 %268, %121, !dbg !42
290
+ %277 = sub i64 %269, %121, !dbg !42
291
+ %278 = sub i64 %270, %121, !dbg !42
292
+ %279 = sub i64 %271, %121, !dbg !42
293
+ %280 = sub i64 %272, %121, !dbg !42
294
+ %281 = sub i64 %273, %121, !dbg !42
295
+ %282 = sub i64 %274, %121, !dbg !42
296
+ %283 = getelementptr bfloat, ptr addrspace(1) %0, i64 %275, !dbg !43
297
+ %284 = getelementptr bfloat, ptr addrspace(1) %0, i64 %276, !dbg !43
298
+ %285 = getelementptr bfloat, ptr addrspace(1) %0, i64 %277, !dbg !43
299
+ %286 = getelementptr bfloat, ptr addrspace(1) %0, i64 %278, !dbg !43
300
+ %287 = getelementptr bfloat, ptr addrspace(1) %0, i64 %279, !dbg !43
301
+ %288 = getelementptr bfloat, ptr addrspace(1) %0, i64 %280, !dbg !43
302
+ %289 = getelementptr bfloat, ptr addrspace(1) %0, i64 %281, !dbg !43
303
+ %290 = getelementptr bfloat, ptr addrspace(1) %0, i64 %282, !dbg !43
304
+ %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
305
+ %292 = and <8 x i1> %36, %266, !dbg !45
306
+ %293 = extractelement <8 x i1> %292, i64 0, !dbg !46
307
+ %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %283, i64 %291, i1 %293) #4, !dbg !44
308
+ %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
309
+ %296 = extractelement <8 x i1> %292, i64 1, !dbg !46
310
+ %297 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %295, i1 %296) #4, !dbg !44
311
+ %298 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
312
+ %299 = extractelement <8 x i1> %292, i64 2, !dbg !46
313
+ %300 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %285, i64 %298, i1 %299) #4, !dbg !44
314
+ %301 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
315
+ %302 = extractelement <8 x i1> %292, i64 3, !dbg !46
316
+ %303 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %301, i1 %302) #4, !dbg !44
317
+ %304 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
318
+ %305 = extractelement <8 x i1> %292, i64 4, !dbg !46
319
+ %306 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %287, i64 %304, i1 %305) #4, !dbg !44
320
+ %307 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
321
+ %308 = extractelement <8 x i1> %292, i64 5, !dbg !46
322
+ %309 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %307, i1 %308) #4, !dbg !44
323
+ %310 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
324
+ %311 = extractelement <8 x i1> %292, i64 6, !dbg !46
325
+ %312 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %289, i64 %310, i1 %311) #4, !dbg !44
326
+ %313 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44
327
+ %314 = extractelement <8 x i1> %292, i64 7, !dbg !46
328
+ %315 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %313, i1 %314) #4, !dbg !44
329
+ %316 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
330
+ %317 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %97, i64 %316, i1 %293) #4, !dbg !46
331
+ %318 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
332
+ %319 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %98, i64 %318, i1 %296) #4, !dbg !46
333
+ %320 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
334
+ %321 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %99, i64 %320, i1 %299) #4, !dbg !46
335
+ %322 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
336
+ %323 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %100, i64 %322, i1 %302) #4, !dbg !46
337
+ %324 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
338
+ %325 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %101, i64 %324, i1 %305) #4, !dbg !46
339
+ %326 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
340
+ %327 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %102, i64 %326, i1 %308) #4, !dbg !46
341
+ %328 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
342
+ %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %103, i64 %328, i1 %311) #4, !dbg !46
343
+ %330 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46
344
+ %331 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %330, i1 %314) #4, !dbg !46
345
+ %332 = insertelement <8 x i64> poison, i64 %317, i64 0, !dbg !47
346
+ %333 = insertelement <8 x i64> %332, i64 %319, i64 1, !dbg !47
347
+ %334 = insertelement <8 x i64> %333, i64 %321, i64 2, !dbg !47
348
+ %335 = insertelement <8 x i64> %334, i64 %323, i64 3, !dbg !47
349
+ %336 = insertelement <8 x i64> %335, i64 %325, i64 4, !dbg !47
350
+ %337 = insertelement <8 x i64> %336, i64 %327, i64 5, !dbg !47
351
+ %338 = insertelement <8 x i64> %337, i64 %329, i64 6, !dbg !47
352
+ %339 = insertelement <8 x i64> %338, i64 %331, i64 7, !dbg !47
353
+ %340 = icmp slt <8 x i64> %339, zeroinitializer, !dbg !47
354
+ %341 = select <8 x i1> %340, <8 x i64> %192, <8 x i64> zeroinitializer, !dbg !48
355
+ %342 = add <8 x i64> %341, %339, !dbg !48
356
+ %343 = icmp slt <8 x i64> %342, zeroinitializer, !dbg !49
357
+ %344 = icmp sge <8 x i64> %342, %192, !dbg !50
358
+ %345 = or <8 x i1> %343, %344, !dbg !51
359
+ %346 = and <8 x i1> %292, %345, !dbg !52
360
+ %347 = bitcast <8 x i1> %346 to i8, !dbg !53
361
+ %.not97 = icmp eq i8 %347, 0, !dbg !53
362
+ br i1 %.not97, label %349, label %348, !dbg !53
363
+
364
+ 348: ; preds = %201
365
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 51, ptr nonnull @assertFunc_1, i64 1), !dbg !53
366
+ unreachable, !dbg !53
367
+
368
+ 349: ; preds = %201
369
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
370
+ %350 = sub i64 %5, %121, !dbg !54
371
+ %351 = extractelement <8 x i64> %342, i64 0, !dbg !55
372
+ %352 = mul i64 %351, %5, !dbg !55
373
+ %353 = extractelement <8 x i64> %342, i64 1, !dbg !55
374
+ %354 = mul i64 %353, %5, !dbg !55
375
+ %355 = extractelement <8 x i64> %342, i64 2, !dbg !55
376
+ %356 = mul i64 %355, %5, !dbg !55
377
+ %357 = extractelement <8 x i64> %342, i64 3, !dbg !55
378
+ %358 = mul i64 %357, %5, !dbg !55
379
+ %359 = extractelement <8 x i64> %342, i64 4, !dbg !55
380
+ %360 = mul i64 %359, %5, !dbg !55
381
+ %361 = extractelement <8 x i64> %342, i64 5, !dbg !55
382
+ %362 = mul i64 %361, %5, !dbg !55
383
+ %363 = extractelement <8 x i64> %342, i64 6, !dbg !55
384
+ %364 = mul i64 %363, %5, !dbg !55
385
+ %365 = extractelement <8 x i64> %342, i64 7, !dbg !55
386
+ %366 = mul i64 %365, %5, !dbg !55
387
+ %367 = getelementptr bfloat, ptr addrspace(1) %2, i64 %350, !dbg !56
388
+ %368 = getelementptr bfloat, ptr addrspace(1) %367, i64 %202, !dbg !56
389
+ %369 = getelementptr bfloat, ptr addrspace(1) %368, i64 %352, !dbg !56
390
+ %370 = getelementptr bfloat, ptr addrspace(1) %367, i64 %204, !dbg !56
391
+ %371 = getelementptr bfloat, ptr addrspace(1) %370, i64 %354, !dbg !56
392
+ %372 = getelementptr bfloat, ptr addrspace(1) %367, i64 %206, !dbg !56
393
+ %373 = getelementptr bfloat, ptr addrspace(1) %372, i64 %356, !dbg !56
394
+ %374 = getelementptr bfloat, ptr addrspace(1) %367, i64 %208, !dbg !56
395
+ %375 = getelementptr bfloat, ptr addrspace(1) %374, i64 %358, !dbg !56
396
+ %376 = getelementptr bfloat, ptr addrspace(1) %367, i64 %210, !dbg !56
397
+ %377 = getelementptr bfloat, ptr addrspace(1) %376, i64 %360, !dbg !56
398
+ %378 = getelementptr bfloat, ptr addrspace(1) %367, i64 %212, !dbg !56
399
+ %379 = getelementptr bfloat, ptr addrspace(1) %378, i64 %362, !dbg !56
400
+ %380 = getelementptr bfloat, ptr addrspace(1) %367, i64 %214, !dbg !56
401
+ %381 = getelementptr bfloat, ptr addrspace(1) %380, i64 %364, !dbg !56
402
+ %382 = getelementptr bfloat, ptr addrspace(1) %367, i64 %216, !dbg !56
403
+ %383 = getelementptr bfloat, ptr addrspace(1) %382, i64 %366, !dbg !56
404
+ %384 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
405
+ %385 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %369, i64 %384, i1 %293) #4, !dbg !57
406
+ %386 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
407
+ %387 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %371, i64 %386, i1 %296) #4, !dbg !57
408
+ %388 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
409
+ %389 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %388, i1 %299) #4, !dbg !57
410
+ %390 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
411
+ %391 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %375, i64 %390, i1 %302) #4, !dbg !57
412
+ %392 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
413
+ %393 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %377, i64 %392, i1 %305) #4, !dbg !57
414
+ %394 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
415
+ %395 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %379, i64 %394, i1 %308) #4, !dbg !57
416
+ %396 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
417
+ %397 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %381, i64 %396, i1 %311) #4, !dbg !57
418
+ %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57
419
+ %399 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %383, i64 %398, i1 %314) #4, !dbg !57
420
+ %400 = insertelement <8 x i64> poison, i64 %106, i64 0, !dbg !58
421
+ %401 = insertelement <8 x i64> %400, i64 %108, i64 1, !dbg !58
422
+ %402 = insertelement <8 x i64> %401, i64 %110, i64 2, !dbg !58
423
+ %403 = insertelement <8 x i64> %402, i64 %112, i64 3, !dbg !58
424
+ %404 = insertelement <8 x i64> %403, i64 %114, i64 4, !dbg !58
425
+ %405 = insertelement <8 x i64> %404, i64 %116, i64 5, !dbg !58
426
+ %406 = insertelement <8 x i64> %405, i64 %118, i64 6, !dbg !58
427
+ %407 = insertelement <8 x i64> %406, i64 %120, i64 7, !dbg !58
428
+ %408 = icmp slt <8 x i64> %407, zeroinitializer, !dbg !58
429
+ %409 = insertelement <8 x i64> poison, i64 %8, i64 0, !dbg !59
430
+ %410 = shufflevector <8 x i64> %409, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !59
431
+ %411 = select <8 x i1> %408, <8 x i64> %410, <8 x i64> zeroinitializer, !dbg !59
432
+ %412 = add <8 x i64> %411, %407, !dbg !59
433
+ %413 = icmp slt <8 x i64> %412, zeroinitializer, !dbg !60
434
+ %414 = icmp sge <8 x i64> %412, %410, !dbg !61
435
+ %415 = or <8 x i1> %413, %414, !dbg !62
436
+ %416 = and <8 x i1> %36, %415, !dbg !63
437
+ %417 = bitcast <8 x i1> %416 to i8, !dbg !64
438
+ %.not98 = icmp eq i8 %417, 0, !dbg !64
439
+ br i1 %.not98, label %419, label %418, !dbg !64
440
+
441
+ 418: ; preds = %349
442
+ tail call void @__assertfail(ptr nonnull @assertMessage_2, ptr nonnull @assertFile_2, i32 62, ptr nonnull @assertFunc_2, i64 1), !dbg !64
443
+ unreachable, !dbg !64
444
+
445
+ 419: ; preds = %349
446
+ %420 = bitcast i16 %165 to bfloat, !dbg !25
447
+ %421 = fpext bfloat %420 to float, !dbg !65
448
+ %422 = bitcast i16 %265 to bfloat, !dbg !39
449
+ %423 = fpext bfloat %422 to float, !dbg !66
450
+ %424 = fmul float %421, %423, !dbg !67
451
+ %425 = fsub float 0.000000e+00, %424, !dbg !68
452
+ %426 = extractelement <8 x i1> %141, i64 7, !dbg !69
453
+ %427 = select i1 %426, float %425, float 0.000000e+00, !dbg !69
454
+ %428 = bitcast i16 %315 to bfloat, !dbg !44
455
+ %429 = fpext bfloat %428 to float, !dbg !70
456
+ %430 = bitcast i16 %399 to bfloat, !dbg !57
457
+ %431 = fpext bfloat %430 to float, !dbg !71
458
+ %432 = fmul float %429, %431, !dbg !72
459
+ %433 = extractelement <8 x i1> %266, i64 7, !dbg !69
460
+ %434 = select i1 %433, float %432, float 0.000000e+00, !dbg !69
461
+ %435 = fadd float %427, %434, !dbg !73
462
+ %436 = bitcast i16 %162 to bfloat, !dbg !25
463
+ %437 = fpext bfloat %436 to float, !dbg !65
464
+ %438 = bitcast i16 %263 to bfloat, !dbg !39
465
+ %439 = fpext bfloat %438 to float, !dbg !66
466
+ %440 = fmul float %437, %439, !dbg !67
467
+ %441 = fsub float 0.000000e+00, %440, !dbg !68
468
+ %442 = extractelement <8 x i1> %141, i64 6, !dbg !69
469
+ %443 = select i1 %442, float %441, float 0.000000e+00, !dbg !69
470
+ %444 = bitcast i16 %312 to bfloat, !dbg !44
471
+ %445 = fpext bfloat %444 to float, !dbg !70
472
+ %446 = bitcast i16 %397 to bfloat, !dbg !57
473
+ %447 = fpext bfloat %446 to float, !dbg !71
474
+ %448 = fmul float %445, %447, !dbg !72
475
+ %449 = extractelement <8 x i1> %266, i64 6, !dbg !69
476
+ %450 = select i1 %449, float %448, float 0.000000e+00, !dbg !69
477
+ %451 = fadd float %443, %450, !dbg !73
478
+ %452 = bitcast i16 %159 to bfloat, !dbg !25
479
+ %453 = fpext bfloat %452 to float, !dbg !65
480
+ %454 = bitcast i16 %261 to bfloat, !dbg !39
481
+ %455 = fpext bfloat %454 to float, !dbg !66
482
+ %456 = fmul float %453, %455, !dbg !67
483
+ %457 = fsub float 0.000000e+00, %456, !dbg !68
484
+ %458 = extractelement <8 x i1> %141, i64 5, !dbg !69
485
+ %459 = select i1 %458, float %457, float 0.000000e+00, !dbg !69
486
+ %460 = bitcast i16 %309 to bfloat, !dbg !44
487
+ %461 = fpext bfloat %460 to float, !dbg !70
488
+ %462 = bitcast i16 %395 to bfloat, !dbg !57
489
+ %463 = fpext bfloat %462 to float, !dbg !71
490
+ %464 = fmul float %461, %463, !dbg !72
491
+ %465 = extractelement <8 x i1> %266, i64 5, !dbg !69
492
+ %466 = select i1 %465, float %464, float 0.000000e+00, !dbg !69
493
+ %467 = fadd float %459, %466, !dbg !73
494
+ %468 = bitcast i16 %156 to bfloat, !dbg !25
495
+ %469 = fpext bfloat %468 to float, !dbg !65
496
+ %470 = bitcast i16 %259 to bfloat, !dbg !39
497
+ %471 = fpext bfloat %470 to float, !dbg !66
498
+ %472 = fmul float %469, %471, !dbg !67
499
+ %473 = fsub float 0.000000e+00, %472, !dbg !68
500
+ %474 = extractelement <8 x i1> %141, i64 4, !dbg !69
501
+ %475 = select i1 %474, float %473, float 0.000000e+00, !dbg !69
502
+ %476 = bitcast i16 %306 to bfloat, !dbg !44
503
+ %477 = fpext bfloat %476 to float, !dbg !70
504
+ %478 = bitcast i16 %393 to bfloat, !dbg !57
505
+ %479 = fpext bfloat %478 to float, !dbg !71
506
+ %480 = fmul float %477, %479, !dbg !72
507
+ %481 = extractelement <8 x i1> %266, i64 4, !dbg !69
508
+ %482 = select i1 %481, float %480, float 0.000000e+00, !dbg !69
509
+ %483 = fadd float %475, %482, !dbg !73
510
+ %484 = bitcast i16 %153 to bfloat, !dbg !25
511
+ %485 = fpext bfloat %484 to float, !dbg !65
512
+ %486 = bitcast i16 %257 to bfloat, !dbg !39
513
+ %487 = fpext bfloat %486 to float, !dbg !66
514
+ %488 = fmul float %485, %487, !dbg !67
515
+ %489 = fsub float 0.000000e+00, %488, !dbg !68
516
+ %490 = extractelement <8 x i1> %141, i64 3, !dbg !69
517
+ %491 = select i1 %490, float %489, float 0.000000e+00, !dbg !69
518
+ %492 = bitcast i16 %303 to bfloat, !dbg !44
519
+ %493 = fpext bfloat %492 to float, !dbg !70
520
+ %494 = bitcast i16 %391 to bfloat, !dbg !57
521
+ %495 = fpext bfloat %494 to float, !dbg !71
522
+ %496 = fmul float %493, %495, !dbg !72
523
+ %497 = extractelement <8 x i1> %266, i64 3, !dbg !69
524
+ %498 = select i1 %497, float %496, float 0.000000e+00, !dbg !69
525
+ %499 = fadd float %491, %498, !dbg !73
526
+ %500 = bitcast i16 %150 to bfloat, !dbg !25
527
+ %501 = fpext bfloat %500 to float, !dbg !65
528
+ %502 = bitcast i16 %255 to bfloat, !dbg !39
529
+ %503 = fpext bfloat %502 to float, !dbg !66
530
+ %504 = fmul float %501, %503, !dbg !67
531
+ %505 = fsub float 0.000000e+00, %504, !dbg !68
532
+ %506 = extractelement <8 x i1> %141, i64 2, !dbg !69
533
+ %507 = select i1 %506, float %505, float 0.000000e+00, !dbg !69
534
+ %508 = bitcast i16 %300 to bfloat, !dbg !44
535
+ %509 = fpext bfloat %508 to float, !dbg !70
536
+ %510 = bitcast i16 %389 to bfloat, !dbg !57
537
+ %511 = fpext bfloat %510 to float, !dbg !71
538
+ %512 = fmul float %509, %511, !dbg !72
539
+ %513 = extractelement <8 x i1> %266, i64 2, !dbg !69
540
+ %514 = select i1 %513, float %512, float 0.000000e+00, !dbg !69
541
+ %515 = fadd float %507, %514, !dbg !73
542
+ %516 = bitcast i16 %147 to bfloat, !dbg !25
543
+ %517 = fpext bfloat %516 to float, !dbg !65
544
+ %518 = bitcast i16 %253 to bfloat, !dbg !39
545
+ %519 = fpext bfloat %518 to float, !dbg !66
546
+ %520 = fmul float %517, %519, !dbg !67
547
+ %521 = fsub float 0.000000e+00, %520, !dbg !68
548
+ %522 = extractelement <8 x i1> %141, i64 1, !dbg !69
549
+ %523 = select i1 %522, float %521, float 0.000000e+00, !dbg !69
550
+ %524 = bitcast i16 %297 to bfloat, !dbg !44
551
+ %525 = fpext bfloat %524 to float, !dbg !70
552
+ %526 = bitcast i16 %387 to bfloat, !dbg !57
553
+ %527 = fpext bfloat %526 to float, !dbg !71
554
+ %528 = fmul float %525, %527, !dbg !72
555
+ %529 = extractelement <8 x i1> %266, i64 1, !dbg !69
556
+ %530 = select i1 %529, float %528, float 0.000000e+00, !dbg !69
557
+ %531 = fadd float %523, %530, !dbg !73
558
+ %532 = bitcast i16 %144 to bfloat, !dbg !25
559
+ %533 = fpext bfloat %532 to float, !dbg !65
560
+ %534 = bitcast i16 %251 to bfloat, !dbg !39
561
+ %535 = fpext bfloat %534 to float, !dbg !66
562
+ %536 = fmul float %533, %535, !dbg !67
563
+ %537 = fsub float 0.000000e+00, %536, !dbg !68
564
+ %538 = extractelement <8 x i1> %141, i64 0, !dbg !69
565
+ %539 = select i1 %538, float %537, float 0.000000e+00, !dbg !69
566
+ %540 = bitcast i16 %294 to bfloat, !dbg !44
567
+ %541 = fpext bfloat %540 to float, !dbg !70
568
+ %542 = bitcast i16 %385 to bfloat, !dbg !57
569
+ %543 = fpext bfloat %542 to float, !dbg !71
570
+ %544 = fmul float %541, %543, !dbg !72
571
+ %545 = extractelement <8 x i1> %266, i64 0, !dbg !69
572
+ %546 = select i1 %545, float %544, float 0.000000e+00, !dbg !69
573
+ %547 = fadd float %539, %546, !dbg !73
574
+ %548 = bitcast i16 %96 to bfloat, !dbg !19
575
+ %549 = fpext bfloat %548 to float, !dbg !74
576
+ %550 = bitcast i16 %93 to bfloat, !dbg !19
577
+ %551 = fpext bfloat %550 to float, !dbg !74
578
+ %552 = bitcast i16 %90 to bfloat, !dbg !19
579
+ %553 = fpext bfloat %552 to float, !dbg !74
580
+ %554 = bitcast i16 %87 to bfloat, !dbg !19
581
+ %555 = fpext bfloat %554 to float, !dbg !74
582
+ %556 = bitcast i16 %84 to bfloat, !dbg !19
583
+ %557 = fpext bfloat %556 to float, !dbg !74
584
+ %558 = bitcast i16 %81 to bfloat, !dbg !19
585
+ %559 = fpext bfloat %558 to float, !dbg !74
586
+ %560 = bitcast i16 %78 to bfloat, !dbg !19
587
+ %561 = fpext bfloat %560 to float, !dbg !74
588
+ %562 = bitcast i16 %75 to bfloat, !dbg !19
589
+ %563 = fpext bfloat %562 to float, !dbg !74
590
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !64
591
+ %564 = extractelement <8 x i64> %412, i64 0, !dbg !75
592
+ %565 = mul i64 %564, %5, !dbg !75
593
+ %566 = extractelement <8 x i64> %412, i64 1, !dbg !75
594
+ %567 = mul i64 %566, %5, !dbg !75
595
+ %568 = extractelement <8 x i64> %412, i64 2, !dbg !75
596
+ %569 = mul i64 %568, %5, !dbg !75
597
+ %570 = extractelement <8 x i64> %412, i64 3, !dbg !75
598
+ %571 = mul i64 %570, %5, !dbg !75
599
+ %572 = extractelement <8 x i64> %412, i64 4, !dbg !75
600
+ %573 = mul i64 %572, %5, !dbg !75
601
+ %574 = extractelement <8 x i64> %412, i64 5, !dbg !75
602
+ %575 = mul i64 %574, %5, !dbg !75
603
+ %576 = extractelement <8 x i64> %412, i64 6, !dbg !75
604
+ %577 = mul i64 %576, %5, !dbg !75
605
+ %578 = extractelement <8 x i64> %412, i64 7, !dbg !75
606
+ %579 = mul i64 %578, %5, !dbg !75
607
+ %580 = getelementptr bfloat, ptr addrspace(1) %3, i64 %202, !dbg !76
608
+ %581 = getelementptr bfloat, ptr addrspace(1) %580, i64 %565, !dbg !76
609
+ %582 = getelementptr bfloat, ptr addrspace(1) %3, i64 %204, !dbg !76
610
+ %583 = getelementptr bfloat, ptr addrspace(1) %582, i64 %567, !dbg !76
611
+ %584 = getelementptr bfloat, ptr addrspace(1) %3, i64 %206, !dbg !76
612
+ %585 = getelementptr bfloat, ptr addrspace(1) %584, i64 %569, !dbg !76
613
+ %586 = getelementptr bfloat, ptr addrspace(1) %3, i64 %208, !dbg !76
614
+ %587 = getelementptr bfloat, ptr addrspace(1) %586, i64 %571, !dbg !76
615
+ %588 = getelementptr bfloat, ptr addrspace(1) %3, i64 %210, !dbg !76
616
+ %589 = getelementptr bfloat, ptr addrspace(1) %588, i64 %573, !dbg !76
617
+ %590 = getelementptr bfloat, ptr addrspace(1) %3, i64 %212, !dbg !76
618
+ %591 = getelementptr bfloat, ptr addrspace(1) %590, i64 %575, !dbg !76
619
+ %592 = getelementptr bfloat, ptr addrspace(1) %3, i64 %214, !dbg !76
620
+ %593 = getelementptr bfloat, ptr addrspace(1) %592, i64 %577, !dbg !76
621
+ %594 = getelementptr bfloat, ptr addrspace(1) %3, i64 %216, !dbg !76
622
+ %595 = getelementptr bfloat, ptr addrspace(1) %594, i64 %579, !dbg !76
623
+ %596 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
624
+ %597 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %581, i64 %596, i1 %74) #4, !dbg !77
625
+ %598 = bitcast i16 %597 to bfloat, !dbg !77
626
+ %599 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
627
+ %600 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %583, i64 %599, i1 %77) #4, !dbg !77
628
+ %601 = bitcast i16 %600 to bfloat, !dbg !77
629
+ %602 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
630
+ %603 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %585, i64 %602, i1 %80) #4, !dbg !77
631
+ %604 = bitcast i16 %603 to bfloat, !dbg !77
632
+ %605 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
633
+ %606 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %587, i64 %605, i1 %83) #4, !dbg !77
634
+ %607 = bitcast i16 %606 to bfloat, !dbg !77
635
+ %608 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
636
+ %609 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %589, i64 %608, i1 %86) #4, !dbg !77
637
+ %610 = bitcast i16 %609 to bfloat, !dbg !77
638
+ %611 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
639
+ %612 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %591, i64 %611, i1 %89) #4, !dbg !77
640
+ %613 = bitcast i16 %612 to bfloat, !dbg !77
641
+ %614 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
642
+ %615 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %593, i64 %614, i1 %92) #4, !dbg !77
643
+ %616 = bitcast i16 %615 to bfloat, !dbg !77
644
+ %617 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77
645
+ %618 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %595, i64 %617, i1 %95) #4, !dbg !77
646
+ %619 = bitcast i16 %618 to bfloat, !dbg !77
647
+ %620 = fpext bfloat %598 to float, !dbg !78
648
+ %621 = fpext bfloat %601 to float, !dbg !78
649
+ %622 = fpext bfloat %604 to float, !dbg !78
650
+ %623 = fpext bfloat %607 to float, !dbg !78
651
+ %624 = fpext bfloat %610 to float, !dbg !78
652
+ %625 = fpext bfloat %613 to float, !dbg !78
653
+ %626 = fpext bfloat %616 to float, !dbg !78
654
+ %627 = fpext bfloat %619 to float, !dbg !78
655
+ %628 = fmul float %563, %620, !dbg !79
656
+ %629 = fmul float %561, %621, !dbg !79
657
+ %630 = fmul float %559, %622, !dbg !79
658
+ %631 = fmul float %557, %623, !dbg !79
659
+ %632 = fmul float %555, %624, !dbg !79
660
+ %633 = fmul float %553, %625, !dbg !79
661
+ %634 = fmul float %551, %626, !dbg !79
662
+ %635 = fmul float %549, %627, !dbg !79
663
+ %636 = fadd float %547, %628, !dbg !80
664
+ %637 = fadd float %531, %629, !dbg !80
665
+ %638 = fadd float %515, %630, !dbg !80
666
+ %639 = fadd float %499, %631, !dbg !80
667
+ %640 = fadd float %483, %632, !dbg !80
668
+ %641 = fadd float %467, %633, !dbg !80
669
+ %642 = fadd float %451, %634, !dbg !80
670
+ %643 = fadd float %435, %635, !dbg !80
671
+ %644 = getelementptr bfloat, ptr addrspace(1) %4, i64 %41, !dbg !81
672
+ %645 = getelementptr bfloat, ptr addrspace(1) %4, i64 %43, !dbg !81
673
+ %646 = getelementptr bfloat, ptr addrspace(1) %4, i64 %45, !dbg !81
674
+ %647 = getelementptr bfloat, ptr addrspace(1) %4, i64 %47, !dbg !81
675
+ %648 = getelementptr bfloat, ptr addrspace(1) %4, i64 %49, !dbg !81
676
+ %649 = getelementptr bfloat, ptr addrspace(1) %4, i64 %51, !dbg !81
677
+ %650 = getelementptr bfloat, ptr addrspace(1) %4, i64 %53, !dbg !81
678
+ %651 = getelementptr bfloat, ptr addrspace(1) %4, i64 %55, !dbg !81
679
+ %652 = fptrunc float %636 to bfloat, !dbg !82
680
+ %653 = fptrunc float %637 to bfloat, !dbg !82
681
+ %654 = fptrunc float %638 to bfloat, !dbg !82
682
+ %655 = fptrunc float %639 to bfloat, !dbg !82
683
+ %656 = fptrunc float %640 to bfloat, !dbg !82
684
+ %657 = fptrunc float %641 to bfloat, !dbg !82
685
+ %658 = fptrunc float %642 to bfloat, !dbg !82
686
+ %659 = fptrunc float %643 to bfloat, !dbg !82
687
+ %660 = bitcast bfloat %652 to i16, !dbg !82
688
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %660, ptr addrspace(1) %644, i1 %74) #4, !dbg !82
689
+ %661 = bitcast bfloat %653 to i16, !dbg !82
690
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %661, ptr addrspace(1) %645, i1 %77) #4, !dbg !82
691
+ %662 = bitcast bfloat %654 to i16, !dbg !82
692
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %662, ptr addrspace(1) %646, i1 %80) #4, !dbg !82
693
+ %663 = bitcast bfloat %655 to i16, !dbg !82
694
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %663, ptr addrspace(1) %647, i1 %83) #4, !dbg !82
695
+ %664 = bitcast bfloat %656 to i16, !dbg !82
696
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %664, ptr addrspace(1) %648, i1 %86) #4, !dbg !82
697
+ %665 = bitcast bfloat %657 to i16, !dbg !82
698
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %665, ptr addrspace(1) %649, i1 %89) #4, !dbg !82
699
+ %666 = bitcast bfloat %658 to i16, !dbg !82
700
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %666, ptr addrspace(1) %650, i1 %92) #4, !dbg !82
701
+ %667 = bitcast bfloat %659 to i16, !dbg !82
702
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %667, ptr addrspace(1) %651, i1 %95) #4, !dbg !82
703
+ ret void, !dbg !83
704
+ }
705
+
706
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
707
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
708
+
709
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
710
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
711
+
712
+ ; Function Attrs: convergent nocallback nounwind
713
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
714
+
715
+ attributes #0 = { noreturn }
716
+ attributes #1 = { "nvvm.reqntid"="128" }
717
+ attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
718
+ attributes #3 = { convergent nocallback nounwind }
719
+ attributes #4 = { nounwind }
720
+
721
+ !llvm.dbg.cu = !{!0}
722
+ !llvm.module.flags = !{!2, !3}
723
+ !llvm.ident = !{!4}
724
+
725
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
726
+ !1 = !DIFile(filename: "cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb")
727
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
728
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
729
+ !4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
730
+ !5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized)
731
+ !6 = !DIFile(filename: "<unknown>", directory: "")
732
+ !7 = !DISubroutineType(cc: DW_CC_normal, types: !8)
733
+ !8 = !{}
734
+ !9 = distinct !DISubprogram(name: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0", linkageName: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
735
+ !10 = !DILocation(line: 19, column: 28, scope: !9)
736
+ !11 = !DILocation(line: 19, column: 33, scope: !9)
737
+ !12 = !DILocation(line: 20, column: 36, scope: !9)
738
+ !13 = !DILocation(line: 21, column: 21, scope: !9)
739
+ !14 = !DILocation(line: 20, column: 23, scope: !9)
740
+ !15 = !DILocation(line: 22, column: 19, scope: !9)
741
+ !16 = !DILocation(line: 25, column: 31, scope: !9)
742
+ !17 = !DILocation(line: 24, column: 21, scope: !9)
743
+ !18 = !DILocation(line: 24, column: 28, scope: !9)
744
+ !19 = !DILocation(line: 25, column: 36, scope: !9)
745
+ !20 = !DILocation(line: 26, column: 31, scope: !9)
746
+ !21 = !DILocation(line: 26, column: 36, scope: !9)
747
+ !22 = !DILocation(line: 28, column: 18, scope: !9)
748
+ !23 = !DILocation(line: 30, column: 35, scope: !9)
749
+ !24 = !DILocation(line: 30, column: 30, scope: !9)
750
+ !25 = !DILocation(line: 30, column: 53, scope: !9)
751
+ !26 = !DILocation(line: 29, column: 19, scope: !9)
752
+ !27 = !DILocation(line: 30, column: 60, scope: !9)
753
+ !28 = !DILocation(line: 31, column: 35, scope: !9)
754
+ !29 = !DILocation(line: 34, column: 18, scope: !9)
755
+ !30 = !DILocation(line: 35, column: 32, scope: !9)
756
+ !31 = !DILocation(line: 36, column: 28, scope: !9)
757
+ !32 = !DILocation(line: 36, column: 98, scope: !9)
758
+ !33 = !DILocation(line: 36, column: 64, scope: !9)
759
+ !34 = !DILocation(line: 36, column: 106, scope: !9)
760
+ !35 = !DILocation(line: 36, column: 123, scope: !9)
761
+ !36 = !DILocation(line: 37, column: 36, scope: !9)
762
+ !37 = !DILocation(line: 37, column: 58, scope: !9)
763
+ !38 = !DILocation(line: 37, column: 31, scope: !9)
764
+ !39 = !DILocation(line: 37, column: 65, scope: !9)
765
+ !40 = !DILocation(line: 44, column: 19, scope: !9)
766
+ !41 = !DILocation(line: 45, column: 37, scope: !9)
767
+ !42 = !DILocation(line: 45, column: 42, scope: !9)
768
+ !43 = !DILocation(line: 45, column: 31, scope: !9)
769
+ !44 = !DILocation(line: 45, column: 60, scope: !9)
770
+ !45 = !DILocation(line: 45, column: 68, scope: !9)
771
+ !46 = !DILocation(line: 46, column: 36, scope: !9)
772
+ !47 = !DILocation(line: 49, column: 20, scope: !9)
773
+ !48 = !DILocation(line: 50, column: 35, scope: !9)
774
+ !49 = !DILocation(line: 51, column: 28, scope: !9)
775
+ !50 = !DILocation(line: 51, column: 100, scope: !9)
776
+ !51 = !DILocation(line: 51, column: 65, scope: !9)
777
+ !52 = !DILocation(line: 51, column: 108, scope: !9)
778
+ !53 = !DILocation(line: 51, column: 126, scope: !9)
779
+ !54 = !DILocation(line: 52, column: 37, scope: !9)
780
+ !55 = !DILocation(line: 52, column: 64, scope: !9)
781
+ !56 = !DILocation(line: 52, column: 31, scope: !9)
782
+ !57 = !DILocation(line: 52, column: 72, scope: !9)
783
+ !58 = !DILocation(line: 60, column: 20, scope: !9)
784
+ !59 = !DILocation(line: 61, column: 35, scope: !9)
785
+ !60 = !DILocation(line: 62, column: 28, scope: !9)
786
+ !61 = !DILocation(line: 62, column: 46, scope: !9)
787
+ !62 = !DILocation(line: 62, column: 38, scope: !9)
788
+ !63 = !DILocation(line: 62, column: 54, scope: !9)
789
+ !64 = !DILocation(line: 62, column: 64, scope: !9)
790
+ !65 = !DILocation(line: 30, column: 111, scope: !9)
791
+ !66 = !DILocation(line: 37, column: 123, scope: !9)
792
+ !67 = !DILocation(line: 38, column: 19, scope: !9)
793
+ !68 = !DILocation(line: 39, column: 13, scope: !9)
794
+ !69 = !DILocation(line: 0, scope: !9)
795
+ !70 = !DILocation(line: 45, column: 119, scope: !9)
796
+ !71 = !DILocation(line: 52, column: 131, scope: !9)
797
+ !72 = !DILocation(line: 53, column: 20, scope: !9)
798
+ !73 = !DILocation(line: 57, column: 20, scope: !9)
799
+ !74 = !DILocation(line: 25, column: 76, scope: !9)
800
+ !75 = !DILocation(line: 63, column: 40, scope: !9)
801
+ !76 = !DILocation(line: 63, column: 31, scope: !9)
802
+ !77 = !DILocation(line: 63, column: 48, scope: !9)
803
+ !78 = !DILocation(line: 63, column: 88, scope: !9)
804
+ !79 = !DILocation(line: 64, column: 20, scope: !9)
805
+ !80 = !DILocation(line: 65, column: 20, scope: !9)
806
+ !81 = !DILocation(line: 66, column: 25, scope: !9)
807
+ !82 = !DILocation(line: 66, column: 37, scope: !9)
808
+ !83 = !DILocation(line: 66, column: 4, scope: !9)
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx ADDED
@@ -0,0 +1,1936 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ .noreturn;
19
+ .global .align 1 .b8 assertFunc_2[8] = {117, 110, 107, 110, 111, 119, 110};
20
+ .global .align 1 .b8 assertFile_2[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 109, 98, 47, 99, 109, 98, 53, 122, 108, 108, 100, 107, 114, 102, 108, 101, 121, 104, 99, 107, 104, 105, 111, 109, 104, 112, 102, 115, 118, 107, 102, 107, 108, 108, 111, 116, 98, 111, 112, 97, 108, 100, 100, 119, 107, 100, 102, 104, 108, 105, 122, 118, 101, 117, 120, 46, 112, 121};
21
+ .global .align 1 .b8 assertMessage_2[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 54, 32, 60, 32, 107, 115, 51};
22
+ .global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110};
23
+ .global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 109, 98, 47, 99, 109, 98, 53, 122, 108, 108, 100, 107, 114, 102, 108, 101, 121, 104, 99, 107, 104, 105, 111, 109, 104, 112, 102, 115, 118, 107, 102, 107, 108, 108, 111, 116, 98, 111, 112, 97, 108, 100, 100, 119, 107, 100, 102, 104, 108, 105, 122, 118, 101, 117, 120, 46, 112, 121};
24
+ .global .align 1 .b8 assertMessage_1[65] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 50, 51, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50};
25
+ .global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110};
26
+ .global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 109, 98, 47, 99, 109, 98, 53, 122, 108, 108, 100, 107, 114, 102, 108, 101, 121, 104, 99, 107, 104, 105, 111, 109, 104, 112, 102, 115, 118, 107, 102, 107, 108, 108, 111, 116, 98, 111, 112, 97, 108, 100, 100, 119, 107, 100, 102, 104, 108, 105, 122, 118, 101, 117, 120, 46, 112, 121};
27
+ .global .align 1 .b8 assertMessage_0[64] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 56, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50};
28
+ // @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0
29
+ .visible .entry triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(
30
+ .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0,
31
+ .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1,
32
+ .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2,
33
+ .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3,
34
+ .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4,
35
+ .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5,
36
+ .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6,
37
+ .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7,
38
+ .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8,
39
+ .param .u32 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9,
40
+ .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_10,
41
+ .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_11
42
+ )
43
+ .reqntid 128
44
+ {
45
+ .reg .pred %p<268>;
46
+ .reg .b16 %rs<207>;
47
+ .reg .b32 %r<174>;
48
+ .reg .b64 %rd<682>;
49
+ .loc 1 18 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:18:0
50
+ $L__func_begin0:
51
+ .loc 1 18 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:18:0
52
+
53
+ // %bb.0:
54
+ ld.param.b64 %rd143, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5];
55
+ $L__tmp0:
56
+ .loc 1 19 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:19:28
57
+ mov.u32 %r17, %ctaid.x;
58
+ .loc 1 19 33 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:19:33
59
+ shl.b32 %r18, %r17, 10;
60
+ .loc 1 20 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:20:36
61
+ mov.u32 %r19, %tid.x;
62
+ shl.b32 %r20, %r19, 3;
63
+ and.b32 %r21, %r20, 1016;
64
+ .loc 1 20 23 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:20:23
65
+ or.b32 %r9, %r21, %r18;
66
+ or.b32 %r10, %r9, 1;
67
+ or.b32 %r11, %r9, 2;
68
+ .loc 1 22 19 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:22:19
69
+ cvt.s64.s32 %rd2, %r10;
70
+ cvt.s64.s32 %rd1, %r9;
71
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
72
+ or.b64 %rd147, %rd1, %rd143;
73
+ and.b64 %rd148, %rd147, -4294967296;
74
+ setp.ne.b64 %p25, %rd148, 0;
75
+ @%p25 bra $L__BB0_2;
76
+ bra.uni $L__BB0_1;
77
+ $L__BB0_2:
78
+ div.s64 %rd666, %rd1, %rd143;
79
+ bra.uni $L__BB0_3;
80
+ $L__BB0_1:
81
+ cvt.u32.u64 %r22, %rd143;
82
+ cvt.u32.u64 %r23, %rd1;
83
+ div.u32 %r24, %r23, %r22;
84
+ cvt.u64.u32 %rd666, %r24;
85
+ $L__BB0_3:
86
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
87
+ or.b32 %r12, %r9, 3;
88
+ cvt.s64.s32 %rd3, %r11;
89
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
90
+ or.b64 %rd149, %rd2, %rd143;
91
+ and.b64 %rd150, %rd149, -4294967296;
92
+ setp.ne.b64 %p26, %rd150, 0;
93
+ @%p26 bra $L__BB0_5;
94
+ bra.uni $L__BB0_4;
95
+ $L__BB0_5:
96
+ div.s64 %rd667, %rd2, %rd143;
97
+ bra.uni $L__BB0_6;
98
+ $L__BB0_4:
99
+ cvt.u32.u64 %r25, %rd143;
100
+ cvt.u32.u64 %r26, %rd2;
101
+ div.u32 %r27, %r26, %r25;
102
+ cvt.u64.u32 %rd667, %r27;
103
+ $L__BB0_6:
104
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
105
+ or.b32 %r13, %r9, 4;
106
+ cvt.s64.s32 %rd4, %r12;
107
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
108
+ or.b64 %rd151, %rd3, %rd143;
109
+ and.b64 %rd152, %rd151, -4294967296;
110
+ setp.ne.b64 %p27, %rd152, 0;
111
+ @%p27 bra $L__BB0_8;
112
+ bra.uni $L__BB0_7;
113
+ $L__BB0_8:
114
+ div.s64 %rd668, %rd3, %rd143;
115
+ bra.uni $L__BB0_9;
116
+ $L__BB0_7:
117
+ cvt.u32.u64 %r28, %rd143;
118
+ cvt.u32.u64 %r29, %rd3;
119
+ div.u32 %r30, %r29, %r28;
120
+ cvt.u64.u32 %rd668, %r30;
121
+ $L__BB0_9:
122
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
123
+ or.b32 %r14, %r9, 5;
124
+ cvt.s64.s32 %rd5, %r13;
125
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
126
+ or.b64 %rd153, %rd4, %rd143;
127
+ and.b64 %rd154, %rd153, -4294967296;
128
+ setp.ne.b64 %p28, %rd154, 0;
129
+ @%p28 bra $L__BB0_11;
130
+ bra.uni $L__BB0_10;
131
+ $L__BB0_11:
132
+ div.s64 %rd669, %rd4, %rd143;
133
+ bra.uni $L__BB0_12;
134
+ $L__BB0_10:
135
+ cvt.u32.u64 %r31, %rd143;
136
+ cvt.u32.u64 %r32, %rd4;
137
+ div.u32 %r33, %r32, %r31;
138
+ cvt.u64.u32 %rd669, %r33;
139
+ $L__BB0_12:
140
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
141
+ or.b32 %r15, %r9, 6;
142
+ cvt.s64.s32 %rd6, %r14;
143
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
144
+ or.b64 %rd155, %rd5, %rd143;
145
+ and.b64 %rd156, %rd155, -4294967296;
146
+ setp.ne.b64 %p29, %rd156, 0;
147
+ @%p29 bra $L__BB0_14;
148
+ bra.uni $L__BB0_13;
149
+ $L__BB0_14:
150
+ div.s64 %rd670, %rd5, %rd143;
151
+ bra.uni $L__BB0_15;
152
+ $L__BB0_13:
153
+ cvt.u32.u64 %r34, %rd143;
154
+ cvt.u32.u64 %r35, %rd5;
155
+ div.u32 %r36, %r35, %r34;
156
+ cvt.u64.u32 %rd670, %r36;
157
+ $L__BB0_15:
158
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
159
+ or.b32 %r16, %r9, 7;
160
+ cvt.s64.s32 %rd7, %r15;
161
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
162
+ or.b64 %rd157, %rd6, %rd143;
163
+ and.b64 %rd158, %rd157, -4294967296;
164
+ setp.ne.b64 %p30, %rd158, 0;
165
+ @%p30 bra $L__BB0_17;
166
+ bra.uni $L__BB0_16;
167
+ $L__BB0_17:
168
+ div.s64 %rd671, %rd6, %rd143;
169
+ bra.uni $L__BB0_18;
170
+ $L__BB0_16:
171
+ cvt.u32.u64 %r37, %rd143;
172
+ cvt.u32.u64 %r38, %rd6;
173
+ div.u32 %r39, %r38, %r37;
174
+ cvt.u64.u32 %rd671, %r39;
175
+ $L__BB0_18:
176
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
177
+ cvt.s64.s32 %rd8, %r16;
178
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
179
+ or.b64 %rd159, %rd7, %rd143;
180
+ and.b64 %rd160, %rd159, -4294967296;
181
+ setp.ne.b64 %p31, %rd160, 0;
182
+ @%p31 bra $L__BB0_20;
183
+ bra.uni $L__BB0_19;
184
+ $L__BB0_20:
185
+ div.s64 %rd672, %rd7, %rd143;
186
+ bra.uni $L__BB0_21;
187
+ $L__BB0_19:
188
+ cvt.u32.u64 %r40, %rd143;
189
+ cvt.u32.u64 %r41, %rd7;
190
+ div.u32 %r42, %r41, %r40;
191
+ cvt.u64.u32 %rd672, %r42;
192
+ $L__BB0_21:
193
+ .loc 1 0 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:21
194
+ ld.param.b64 %rd144, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6];
195
+ .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21
196
+ or.b64 %rd161, %rd8, %rd143;
197
+ and.b64 %rd162, %rd161, -4294967296;
198
+ setp.ne.b64 %p32, %rd162, 0;
199
+ @%p32 bra $L__BB0_23;
200
+ bra.uni $L__BB0_22;
201
+ $L__BB0_23:
202
+ div.s64 %rd673, %rd8, %rd143;
203
+ bra.uni $L__BB0_24;
204
+ $L__BB0_22:
205
+ cvt.u32.u64 %r43, %rd143;
206
+ cvt.u32.u64 %r44, %rd8;
207
+ div.u32 %r45, %r44, %r43;
208
+ cvt.u64.u32 %rd673, %r45;
209
+ $L__BB0_24:
210
+ .loc 1 24 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:28
211
+ or.b64 %rd163, %rd666, %rd144;
212
+ and.b64 %rd164, %rd163, -4294967296;
213
+ setp.ne.b64 %p33, %rd164, 0;
214
+ @%p33 bra $L__BB0_26;
215
+ bra.uni $L__BB0_25;
216
+ $L__BB0_26:
217
+ rem.s64 %rd674, %rd666, %rd144;
218
+ bra.uni $L__BB0_27;
219
+ $L__BB0_25:
220
+ cvt.u32.u64 %r46, %rd144;
221
+ cvt.u32.u64 %r47, %rd666;
222
+ rem.u32 %r48, %r47, %r46;
223
+ cvt.u64.u32 %rd674, %r48;
224
+ $L__BB0_27:
225
+ or.b64 %rd165, %rd667, %rd144;
226
+ and.b64 %rd166, %rd165, -4294967296;
227
+ setp.ne.b64 %p34, %rd166, 0;
228
+ @%p34 bra $L__BB0_29;
229
+ bra.uni $L__BB0_28;
230
+ $L__BB0_29:
231
+ rem.s64 %rd675, %rd667, %rd144;
232
+ bra.uni $L__BB0_30;
233
+ $L__BB0_28:
234
+ cvt.u32.u64 %r49, %rd144;
235
+ cvt.u32.u64 %r50, %rd667;
236
+ rem.u32 %r51, %r50, %r49;
237
+ cvt.u64.u32 %rd675, %r51;
238
+ $L__BB0_30:
239
+ or.b64 %rd167, %rd668, %rd144;
240
+ and.b64 %rd168, %rd167, -4294967296;
241
+ setp.ne.b64 %p35, %rd168, 0;
242
+ @%p35 bra $L__BB0_32;
243
+ bra.uni $L__BB0_31;
244
+ $L__BB0_32:
245
+ rem.s64 %rd676, %rd668, %rd144;
246
+ bra.uni $L__BB0_33;
247
+ $L__BB0_31:
248
+ cvt.u32.u64 %r52, %rd144;
249
+ cvt.u32.u64 %r53, %rd668;
250
+ rem.u32 %r54, %r53, %r52;
251
+ cvt.u64.u32 %rd676, %r54;
252
+ $L__BB0_33:
253
+ or.b64 %rd169, %rd669, %rd144;
254
+ and.b64 %rd170, %rd169, -4294967296;
255
+ setp.ne.b64 %p36, %rd170, 0;
256
+ @%p36 bra $L__BB0_35;
257
+ bra.uni $L__BB0_34;
258
+ $L__BB0_35:
259
+ rem.s64 %rd677, %rd669, %rd144;
260
+ bra.uni $L__BB0_36;
261
+ $L__BB0_34:
262
+ cvt.u32.u64 %r55, %rd144;
263
+ cvt.u32.u64 %r56, %rd669;
264
+ rem.u32 %r57, %r56, %r55;
265
+ cvt.u64.u32 %rd677, %r57;
266
+ $L__BB0_36:
267
+ or.b64 %rd171, %rd670, %rd144;
268
+ and.b64 %rd172, %rd171, -4294967296;
269
+ setp.ne.b64 %p37, %rd172, 0;
270
+ @%p37 bra $L__BB0_38;
271
+ bra.uni $L__BB0_37;
272
+ $L__BB0_38:
273
+ rem.s64 %rd678, %rd670, %rd144;
274
+ bra.uni $L__BB0_39;
275
+ $L__BB0_37:
276
+ cvt.u32.u64 %r58, %rd144;
277
+ cvt.u32.u64 %r59, %rd670;
278
+ rem.u32 %r60, %r59, %r58;
279
+ cvt.u64.u32 %rd678, %r60;
280
+ $L__BB0_39:
281
+ or.b64 %rd173, %rd671, %rd144;
282
+ and.b64 %rd174, %rd173, -4294967296;
283
+ setp.ne.b64 %p38, %rd174, 0;
284
+ @%p38 bra $L__BB0_41;
285
+ bra.uni $L__BB0_40;
286
+ $L__BB0_41:
287
+ rem.s64 %rd679, %rd671, %rd144;
288
+ bra.uni $L__BB0_42;
289
+ $L__BB0_40:
290
+ cvt.u32.u64 %r61, %rd144;
291
+ cvt.u32.u64 %r62, %rd671;
292
+ rem.u32 %r63, %r62, %r61;
293
+ cvt.u64.u32 %rd679, %r63;
294
+ $L__BB0_42:
295
+ or.b64 %rd175, %rd672, %rd144;
296
+ and.b64 %rd176, %rd175, -4294967296;
297
+ setp.ne.b64 %p39, %rd176, 0;
298
+ @%p39 bra $L__BB0_44;
299
+ bra.uni $L__BB0_43;
300
+ $L__BB0_44:
301
+ rem.s64 %rd680, %rd672, %rd144;
302
+ bra.uni $L__BB0_45;
303
+ $L__BB0_43:
304
+ cvt.u32.u64 %r64, %rd144;
305
+ cvt.u32.u64 %r65, %rd672;
306
+ rem.u32 %r66, %r65, %r64;
307
+ cvt.u64.u32 %rd680, %r66;
308
+ $L__BB0_45:
309
+ .loc 1 0 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:28
310
+ ld.param.b64 %rd145, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7];
311
+ ld.param.b64 %rd139, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1];
312
+ ld.param.b64 %rd138, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0];
313
+ ld.param.b32 %r8, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9];
314
+ rem.s64 %rd9, %rd1, %rd143;
315
+ rem.s64 %rd16, %rd8, %rd143;
316
+ rem.s64 %rd15, %rd7, %rd143;
317
+ rem.s64 %rd14, %rd6, %rd143;
318
+ rem.s64 %rd13, %rd5, %rd143;
319
+ rem.s64 %rd12, %rd4, %rd143;
320
+ rem.s64 %rd11, %rd3, %rd143;
321
+ rem.s64 %rd10, %rd2, %rd143;
322
+ .loc 1 24 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:28
323
+ or.b64 %rd177, %rd673, %rd144;
324
+ and.b64 %rd178, %rd177, -4294967296;
325
+ setp.ne.b64 %p40, %rd178, 0;
326
+ @%p40 bra $L__BB0_47;
327
+ bra.uni $L__BB0_46;
328
+ $L__BB0_47:
329
+ rem.s64 %rd681, %rd673, %rd144;
330
+ bra.uni $L__BB0_48;
331
+ $L__BB0_46:
332
+ cvt.u32.u64 %r67, %rd144;
333
+ cvt.u32.u64 %r68, %rd673;
334
+ rem.u32 %r69, %r68, %r67;
335
+ cvt.u64.u32 %rd681, %r69;
336
+ $L__BB0_48:
337
+ .loc 1 21 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:21:21
338
+ setp.lt.s32 %p8, %r16, %r8;
339
+ setp.lt.s32 %p7, %r15, %r8;
340
+ setp.lt.s32 %p6, %r14, %r8;
341
+ setp.lt.s32 %p5, %r13, %r8;
342
+ setp.lt.s32 %p4, %r12, %r8;
343
+ setp.lt.s32 %p3, %r11, %r8;
344
+ setp.lt.s32 %p2, %r10, %r8;
345
+ setp.lt.s32 %p1, %r9, %r8;
346
+ .loc 1 25 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:25:31
347
+ shl.b64 %rd291, %rd1, 1;
348
+ add.s64 %rd180, %rd138, %rd291;
349
+ shl.b64 %rd292, %rd2, 1;
350
+ add.s64 %rd183, %rd138, %rd292;
351
+ shl.b64 %rd293, %rd3, 1;
352
+ add.s64 %rd186, %rd138, %rd293;
353
+ shl.b64 %rd294, %rd4, 1;
354
+ add.s64 %rd189, %rd138, %rd294;
355
+ shl.b64 %rd295, %rd5, 1;
356
+ add.s64 %rd192, %rd138, %rd295;
357
+ shl.b64 %rd296, %rd6, 1;
358
+ add.s64 %rd195, %rd138, %rd296;
359
+ shl.b64 %rd297, %rd7, 1;
360
+ add.s64 %rd198, %rd138, %rd297;
361
+ shl.b64 %rd298, %rd8, 1;
362
+ add.s64 %rd201, %rd138, %rd298;
363
+ .loc 1 25 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:25:36
364
+ // begin inline asm
365
+ mov.u64 %rd179, 0x0;
366
+ createpolicy.fractional.L2::evict_last.b64 %rd179, 1.0;
367
+ // end inline asm
368
+ // begin inline asm
369
+ mov.u16 %rs41, 0x0;
370
+ @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs41 }, [ %rd180 + 0 ], %rd179;
371
+ // end inline asm
372
+ // begin inline asm
373
+ mov.u64 %rd182, 0x0;
374
+ createpolicy.fractional.L2::evict_last.b64 %rd182, 1.0;
375
+ // end inline asm
376
+ // begin inline asm
377
+ mov.u16 %rs42, 0x0;
378
+ @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs42 }, [ %rd183 + 0 ], %rd182;
379
+ // end inline asm
380
+ // begin inline asm
381
+ mov.u64 %rd185, 0x0;
382
+ createpolicy.fractional.L2::evict_last.b64 %rd185, 1.0;
383
+ // end inline asm
384
+ // begin inline asm
385
+ mov.u16 %rs43, 0x0;
386
+ @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs43 }, [ %rd186 + 0 ], %rd185;
387
+ // end inline asm
388
+ // begin inline asm
389
+ mov.u64 %rd188, 0x0;
390
+ createpolicy.fractional.L2::evict_last.b64 %rd188, 1.0;
391
+ // end inline asm
392
+ // begin inline asm
393
+ mov.u16 %rs44, 0x0;
394
+ @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs44 }, [ %rd189 + 0 ], %rd188;
395
+ // end inline asm
396
+ // begin inline asm
397
+ mov.u64 %rd191, 0x0;
398
+ createpolicy.fractional.L2::evict_last.b64 %rd191, 1.0;
399
+ // end inline asm
400
+ // begin inline asm
401
+ mov.u16 %rs45, 0x0;
402
+ @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs45 }, [ %rd192 + 0 ], %rd191;
403
+ // end inline asm
404
+ // begin inline asm
405
+ mov.u64 %rd194, 0x0;
406
+ createpolicy.fractional.L2::evict_last.b64 %rd194, 1.0;
407
+ // end inline asm
408
+ // begin inline asm
409
+ mov.u16 %rs46, 0x0;
410
+ @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs46 }, [ %rd195 + 0 ], %rd194;
411
+ // end inline asm
412
+ // begin inline asm
413
+ mov.u64 %rd197, 0x0;
414
+ createpolicy.fractional.L2::evict_last.b64 %rd197, 1.0;
415
+ // end inline asm
416
+ // begin inline asm
417
+ mov.u16 %rs47, 0x0;
418
+ @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs47 }, [ %rd198 + 0 ], %rd197;
419
+ // end inline asm
420
+ // begin inline asm
421
+ mov.u64 %rd200, 0x0;
422
+ createpolicy.fractional.L2::evict_last.b64 %rd200, 1.0;
423
+ // end inline asm
424
+ // begin inline asm
425
+ mov.u16 %rs48, 0x0;
426
+ @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs48 }, [ %rd201 + 0 ], %rd200;
427
+ // end inline asm
428
+ .loc 1 26 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:26:31
429
+ shl.b64 %rd299, %rd674, 3;
430
+ add.s64 %rd261, %rd139, %rd299;
431
+ shl.b64 %rd300, %rd675, 3;
432
+ add.s64 %rd265, %rd139, %rd300;
433
+ shl.b64 %rd301, %rd676, 3;
434
+ add.s64 %rd269, %rd139, %rd301;
435
+ shl.b64 %rd302, %rd677, 3;
436
+ add.s64 %rd273, %rd139, %rd302;
437
+ shl.b64 %rd303, %rd678, 3;
438
+ add.s64 %rd277, %rd139, %rd303;
439
+ shl.b64 %rd304, %rd679, 3;
440
+ add.s64 %rd281, %rd139, %rd304;
441
+ shl.b64 %rd305, %rd680, 3;
442
+ add.s64 %rd285, %rd139, %rd305;
443
+ shl.b64 %rd306, %rd681, 3;
444
+ add.s64 %rd289, %rd139, %rd306;
445
+ .loc 1 26 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:26:36
446
+ // begin inline asm
447
+ mov.u64 %rd203, 0x0;
448
+ createpolicy.fractional.L2::evict_last.b64 %rd203, 1.0;
449
+ // end inline asm
450
+ // begin inline asm
451
+ mov.u64 %rd204, 0x0;
452
+ @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd204 }, [ %rd261 + 0 ], %rd203;
453
+ // end inline asm
454
+ // begin inline asm
455
+ mov.u64 %rd207, 0x0;
456
+ createpolicy.fractional.L2::evict_last.b64 %rd207, 1.0;
457
+ // end inline asm
458
+ // begin inline asm
459
+ mov.u64 %rd208, 0x0;
460
+ @%p2 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd208 }, [ %rd265 + 0 ], %rd207;
461
+ // end inline asm
462
+ // begin inline asm
463
+ mov.u64 %rd211, 0x0;
464
+ createpolicy.fractional.L2::evict_last.b64 %rd211, 1.0;
465
+ // end inline asm
466
+ // begin inline asm
467
+ mov.u64 %rd212, 0x0;
468
+ @%p3 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd212 }, [ %rd269 + 0 ], %rd211;
469
+ // end inline asm
470
+ // begin inline asm
471
+ mov.u64 %rd215, 0x0;
472
+ createpolicy.fractional.L2::evict_last.b64 %rd215, 1.0;
473
+ // end inline asm
474
+ // begin inline asm
475
+ mov.u64 %rd216, 0x0;
476
+ @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd216 }, [ %rd273 + 0 ], %rd215;
477
+ // end inline asm
478
+ // begin inline asm
479
+ mov.u64 %rd219, 0x0;
480
+ createpolicy.fractional.L2::evict_last.b64 %rd219, 1.0;
481
+ // end inline asm
482
+ // begin inline asm
483
+ mov.u64 %rd220, 0x0;
484
+ @%p5 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd220 }, [ %rd277 + 0 ], %rd219;
485
+ // end inline asm
486
+ // begin inline asm
487
+ mov.u64 %rd223, 0x0;
488
+ createpolicy.fractional.L2::evict_last.b64 %rd223, 1.0;
489
+ // end inline asm
490
+ // begin inline asm
491
+ mov.u64 %rd224, 0x0;
492
+ @%p6 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd224 }, [ %rd281 + 0 ], %rd223;
493
+ // end inline asm
494
+ // begin inline asm
495
+ mov.u64 %rd227, 0x0;
496
+ createpolicy.fractional.L2::evict_last.b64 %rd227, 1.0;
497
+ // end inline asm
498
+ // begin inline asm
499
+ mov.u64 %rd228, 0x0;
500
+ @%p7 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd228 }, [ %rd285 + 0 ], %rd227;
501
+ // end inline asm
502
+ // begin inline asm
503
+ mov.u64 %rd231, 0x0;
504
+ createpolicy.fractional.L2::evict_last.b64 %rd231, 1.0;
505
+ // end inline asm
506
+ // begin inline asm
507
+ mov.u64 %rd232, 0x0;
508
+ @%p8 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd232 }, [ %rd289 + 0 ], %rd231;
509
+ // end inline asm
510
+ .loc 1 28 18 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:28:18
511
+ shr.u64 %rd307, %rd143, 63;
512
+ add.s64 %rd308, %rd143, %rd307;
513
+ shr.s64 %rd89, %rd308, 1;
514
+ .loc 1 30 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:35
515
+ sub.s64 %rd309, %rd1, %rd89;
516
+ sub.s64 %rd310, %rd2, %rd89;
517
+ sub.s64 %rd311, %rd3, %rd89;
518
+ sub.s64 %rd312, %rd4, %rd89;
519
+ sub.s64 %rd313, %rd5, %rd89;
520
+ sub.s64 %rd314, %rd6, %rd89;
521
+ sub.s64 %rd315, %rd7, %rd89;
522
+ sub.s64 %rd316, %rd8, %rd89;
523
+ .loc 1 30 30 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:30
524
+ shl.b64 %rd317, %rd309, 1;
525
+ add.s64 %rd236, %rd138, %rd317;
526
+ shl.b64 %rd318, %rd310, 1;
527
+ add.s64 %rd239, %rd138, %rd318;
528
+ shl.b64 %rd319, %rd311, 1;
529
+ add.s64 %rd242, %rd138, %rd319;
530
+ shl.b64 %rd320, %rd312, 1;
531
+ add.s64 %rd245, %rd138, %rd320;
532
+ shl.b64 %rd321, %rd313, 1;
533
+ add.s64 %rd248, %rd138, %rd321;
534
+ shl.b64 %rd322, %rd314, 1;
535
+ add.s64 %rd251, %rd138, %rd322;
536
+ shl.b64 %rd323, %rd315, 1;
537
+ add.s64 %rd254, %rd138, %rd323;
538
+ shl.b64 %rd324, %rd316, 1;
539
+ add.s64 %rd257, %rd138, %rd324;
540
+ .loc 1 30 53 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:53
541
+ // begin inline asm
542
+ mov.u64 %rd235, 0x0;
543
+ createpolicy.fractional.L2::evict_last.b64 %rd235, 1.0;
544
+ // end inline asm
545
+ .loc 1 29 19 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:29:19
546
+ setp.ge.s64 %p73, %rd9, %rd89;
547
+ setp.ge.s64 %p74, %rd10, %rd89;
548
+ setp.ge.s64 %p75, %rd11, %rd89;
549
+ setp.ge.s64 %p76, %rd12, %rd89;
550
+ setp.ge.s64 %p77, %rd13, %rd89;
551
+ setp.ge.s64 %p78, %rd14, %rd89;
552
+ setp.ge.s64 %p79, %rd15, %rd89;
553
+ setp.ge.s64 %p80, %rd16, %rd89;
554
+ .loc 1 30 60 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:60
555
+ and.pred %p16, %p8, %p80;
556
+ and.pred %p15, %p7, %p79;
557
+ and.pred %p14, %p6, %p78;
558
+ and.pred %p13, %p5, %p77;
559
+ and.pred %p12, %p4, %p76;
560
+ and.pred %p11, %p3, %p75;
561
+ and.pred %p10, %p2, %p74;
562
+ and.pred %p9, %p1, %p73;
563
+ mov.b16 %rs64, 0;
564
+ .loc 1 30 53 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:53
565
+ // begin inline asm
566
+ mov.u16 %rs49, %rs64;
567
+ @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs49 }, [ %rd236 + 0 ], %rd235;
568
+ // end inline asm
569
+ // begin inline asm
570
+ mov.u64 %rd238, 0x0;
571
+ createpolicy.fractional.L2::evict_last.b64 %rd238, 1.0;
572
+ // end inline asm
573
+ // begin inline asm
574
+ mov.u16 %rs51, %rs64;
575
+ @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs51 }, [ %rd239 + 0 ], %rd238;
576
+ // end inline asm
577
+ // begin inline asm
578
+ mov.u64 %rd241, 0x0;
579
+ createpolicy.fractional.L2::evict_last.b64 %rd241, 1.0;
580
+ // end inline asm
581
+ // begin inline asm
582
+ mov.u16 %rs53, %rs64;
583
+ @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs53 }, [ %rd242 + 0 ], %rd241;
584
+ // end inline asm
585
+ // begin inline asm
586
+ mov.u64 %rd244, 0x0;
587
+ createpolicy.fractional.L2::evict_last.b64 %rd244, 1.0;
588
+ // end inline asm
589
+ // begin inline asm
590
+ mov.u16 %rs55, %rs64;
591
+ @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs55 }, [ %rd245 + 0 ], %rd244;
592
+ // end inline asm
593
+ // begin inline asm
594
+ mov.u64 %rd247, 0x0;
595
+ createpolicy.fractional.L2::evict_last.b64 %rd247, 1.0;
596
+ // end inline asm
597
+ // begin inline asm
598
+ mov.u16 %rs57, %rs64;
599
+ @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs57 }, [ %rd248 + 0 ], %rd247;
600
+ // end inline asm
601
+ // begin inline asm
602
+ mov.u64 %rd250, 0x0;
603
+ createpolicy.fractional.L2::evict_last.b64 %rd250, 1.0;
604
+ // end inline asm
605
+ // begin inline asm
606
+ mov.u16 %rs59, %rs64;
607
+ @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs59 }, [ %rd251 + 0 ], %rd250;
608
+ // end inline asm
609
+ // begin inline asm
610
+ mov.u64 %rd253, 0x0;
611
+ createpolicy.fractional.L2::evict_last.b64 %rd253, 1.0;
612
+ // end inline asm
613
+ // begin inline asm
614
+ mov.u16 %rs61, %rs64;
615
+ @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs61 }, [ %rd254 + 0 ], %rd253;
616
+ // end inline asm
617
+ // begin inline asm
618
+ mov.u64 %rd256, 0x0;
619
+ createpolicy.fractional.L2::evict_last.b64 %rd256, 1.0;
620
+ // end inline asm
621
+ // begin inline asm
622
+ mov.u16 %rs63, %rs64;
623
+ @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs63 }, [ %rd257 + 0 ], %rd256;
624
+ // end inline asm
625
+ .loc 1 31 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:31:35
626
+ // begin inline asm
627
+ mov.u64 %rd259, 0x0;
628
+ createpolicy.fractional.L2::evict_last.b64 %rd259, 1.0;
629
+ // end inline asm
630
+ // begin inline asm
631
+ mov.u64 %rd260, 0x0;
632
+ @%p9 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd260 }, [ %rd261 + 0 ], %rd259;
633
+ // end inline asm
634
+ // begin inline asm
635
+ mov.u64 %rd263, 0x0;
636
+ createpolicy.fractional.L2::evict_last.b64 %rd263, 1.0;
637
+ // end inline asm
638
+ // begin inline asm
639
+ mov.u64 %rd264, 0x0;
640
+ @%p10 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd264 }, [ %rd265 + 0 ], %rd263;
641
+ // end inline asm
642
+ // begin inline asm
643
+ mov.u64 %rd267, 0x0;
644
+ createpolicy.fractional.L2::evict_last.b64 %rd267, 1.0;
645
+ // end inline asm
646
+ // begin inline asm
647
+ mov.u64 %rd268, 0x0;
648
+ @%p11 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd268 }, [ %rd269 + 0 ], %rd267;
649
+ // end inline asm
650
+ // begin inline asm
651
+ mov.u64 %rd271, 0x0;
652
+ createpolicy.fractional.L2::evict_last.b64 %rd271, 1.0;
653
+ // end inline asm
654
+ // begin inline asm
655
+ mov.u64 %rd272, 0x0;
656
+ @%p12 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd272 }, [ %rd273 + 0 ], %rd271;
657
+ // end inline asm
658
+ // begin inline asm
659
+ mov.u64 %rd275, 0x0;
660
+ createpolicy.fractional.L2::evict_last.b64 %rd275, 1.0;
661
+ // end inline asm
662
+ // begin inline asm
663
+ mov.u64 %rd276, 0x0;
664
+ @%p13 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd276 }, [ %rd277 + 0 ], %rd275;
665
+ // end inline asm
666
+ // begin inline asm
667
+ mov.u64 %rd279, 0x0;
668
+ createpolicy.fractional.L2::evict_last.b64 %rd279, 1.0;
669
+ // end inline asm
670
+ // begin inline asm
671
+ mov.u64 %rd280, 0x0;
672
+ @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd280 }, [ %rd281 + 0 ], %rd279;
673
+ // end inline asm
674
+ // begin inline asm
675
+ mov.u64 %rd283, 0x0;
676
+ createpolicy.fractional.L2::evict_last.b64 %rd283, 1.0;
677
+ // end inline asm
678
+ // begin inline asm
679
+ mov.u64 %rd284, 0x0;
680
+ @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd284 }, [ %rd285 + 0 ], %rd283;
681
+ // end inline asm
682
+ // begin inline asm
683
+ mov.u64 %rd287, 0x0;
684
+ createpolicy.fractional.L2::evict_last.b64 %rd287, 1.0;
685
+ // end inline asm
686
+ // begin inline asm
687
+ mov.u64 %rd288, 0x0;
688
+ @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd288 }, [ %rd289 + 0 ], %rd287;
689
+ // end inline asm
690
+ .loc 1 35 32 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:35:32
691
+ shr.s64 %rd325, %rd268, 63;
692
+ and.b64 %rd326, %rd325, %rd145;
693
+ shr.s64 %rd327, %rd272, 63;
694
+ and.b64 %rd328, %rd327, %rd145;
695
+ shr.s64 %rd329, %rd260, 63;
696
+ and.b64 %rd330, %rd329, %rd145;
697
+ shr.s64 %rd331, %rd264, 63;
698
+ and.b64 %rd332, %rd331, %rd145;
699
+ shr.s64 %rd333, %rd284, 63;
700
+ and.b64 %rd334, %rd333, %rd145;
701
+ shr.s64 %rd335, %rd288, 63;
702
+ and.b64 %rd336, %rd335, %rd145;
703
+ shr.s64 %rd337, %rd276, 63;
704
+ and.b64 %rd338, %rd337, %rd145;
705
+ shr.s64 %rd339, %rd280, 63;
706
+ and.b64 %rd340, %rd339, %rd145;
707
+ add.s64 %rd111, %rd340, %rd280;
708
+ add.s64 %rd110, %rd338, %rd276;
709
+ add.s64 %rd113, %rd336, %rd288;
710
+ add.s64 %rd112, %rd334, %rd284;
711
+ add.s64 %rd107, %rd332, %rd264;
712
+ add.s64 %rd106, %rd330, %rd260;
713
+ add.s64 %rd109, %rd328, %rd272;
714
+ add.s64 %rd108, %rd326, %rd268;
715
+ .loc 1 36 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:28
716
+ setp.lt.s64 %p81, %rd108, 0;
717
+ setp.lt.s64 %p82, %rd109, 0;
718
+ setp.lt.s64 %p83, %rd106, 0;
719
+ setp.lt.s64 %p84, %rd107, 0;
720
+ setp.lt.s64 %p85, %rd112, 0;
721
+ setp.lt.s64 %p86, %rd113, 0;
722
+ setp.lt.s64 %p87, %rd110, 0;
723
+ setp.lt.s64 %p88, %rd111, 0;
724
+ .loc 1 36 98 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:98
725
+ setp.ge.s64 %p89, %rd108, %rd145;
726
+ setp.ge.s64 %p90, %rd109, %rd145;
727
+ setp.ge.s64 %p91, %rd106, %rd145;
728
+ setp.ge.s64 %p92, %rd107, %rd145;
729
+ setp.ge.s64 %p93, %rd112, %rd145;
730
+ setp.ge.s64 %p94, %rd113, %rd145;
731
+ setp.ge.s64 %p95, %rd110, %rd145;
732
+ setp.ge.s64 %p96, %rd111, %rd145;
733
+ .loc 1 36 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:64
734
+ or.pred %p97, %p88, %p96;
735
+ or.pred %p98, %p87, %p95;
736
+ or.pred %p99, %p86, %p94;
737
+ or.pred %p100, %p85, %p93;
738
+ or.pred %p101, %p84, %p92;
739
+ or.pred %p102, %p83, %p91;
740
+ or.pred %p103, %p82, %p90;
741
+ or.pred %p104, %p81, %p89;
742
+ .loc 1 36 106 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:106
743
+ and.pred %p105, %p11, %p104;
744
+ selp.b16 %rs65, 1, 0, %p105;
745
+ shl.b16 %rs66, %rs65, 2;
746
+ and.pred %p106, %p12, %p103;
747
+ selp.b16 %rs67, -1, 0, %p106;
748
+ shl.b16 %rs68, %rs67, 3;
749
+ or.b16 %rs69, %rs68, %rs66;
750
+ and.pred %p107, %p9, %p102;
751
+ selp.b16 %rs70, 1, 0, %p107;
752
+ and.pred %p108, %p10, %p101;
753
+ selp.b16 %rs71, -1, 0, %p108;
754
+ shl.b16 %rs72, %rs71, 1;
755
+ or.b16 %rs73, %rs70, %rs72;
756
+ and.b16 %rs74, %rs73, 3;
757
+ or.b16 %rs75, %rs74, %rs69;
758
+ and.b16 %rs76, %rs75, 15;
759
+ and.pred %p109, %p15, %p100;
760
+ selp.b16 %rs77, 1, 0, %p109;
761
+ shl.b16 %rs78, %rs77, 2;
762
+ and.pred %p110, %p16, %p99;
763
+ selp.b16 %rs79, -1, 0, %p110;
764
+ shl.b16 %rs80, %rs79, 3;
765
+ or.b16 %rs81, %rs80, %rs78;
766
+ and.pred %p111, %p13, %p98;
767
+ selp.b16 %rs82, 1, 0, %p111;
768
+ and.pred %p112, %p14, %p97;
769
+ selp.b16 %rs83, -1, 0, %p112;
770
+ shl.b16 %rs84, %rs83, 1;
771
+ or.b16 %rs85, %rs82, %rs84;
772
+ and.b16 %rs86, %rs85, 3;
773
+ or.b16 %rs87, %rs86, %rs81;
774
+ shl.b16 %rs88, %rs87, 4;
775
+ or.b16 %rs89, %rs76, %rs88;
776
+ .loc 1 36 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:123
777
+ and.b16 %rs90, %rs89, 255;
778
+ setp.eq.b16 %p113, %rs90, 0;
779
+ @%p113 bra $L__BB0_50;
780
+ bra.uni $L__BB0_49;
781
+ $L__BB0_50:
782
+ .loc 1 0 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:123
783
+ ld.param.b64 %rd140, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2];
784
+ .loc 1 36 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:123
785
+ bar.sync 0;
786
+ .loc 1 37 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:36
787
+ sub.s64 %rd427, %rd9, %rd89;
788
+ sub.s64 %rd428, %rd10, %rd89;
789
+ sub.s64 %rd429, %rd11, %rd89;
790
+ sub.s64 %rd430, %rd12, %rd89;
791
+ sub.s64 %rd431, %rd13, %rd89;
792
+ sub.s64 %rd432, %rd14, %rd89;
793
+ sub.s64 %rd433, %rd15, %rd89;
794
+ sub.s64 %rd434, %rd16, %rd89;
795
+ .loc 1 37 58 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:58
796
+ mul.lo.s64 %rd435, %rd106, %rd143;
797
+ mul.lo.s64 %rd436, %rd107, %rd143;
798
+ mul.lo.s64 %rd437, %rd108, %rd143;
799
+ mul.lo.s64 %rd438, %rd109, %rd143;
800
+ mul.lo.s64 %rd439, %rd110, %rd143;
801
+ mul.lo.s64 %rd440, %rd111, %rd143;
802
+ mul.lo.s64 %rd441, %rd112, %rd143;
803
+ mul.lo.s64 %rd442, %rd113, %rd143;
804
+ .loc 1 37 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:31
805
+ shl.b64 %rd443, %rd427, 1;
806
+ add.s64 %rd444, %rd140, %rd443;
807
+ shl.b64 %rd445, %rd435, 1;
808
+ add.s64 %rd348, %rd444, %rd445;
809
+ shl.b64 %rd446, %rd428, 1;
810
+ add.s64 %rd447, %rd140, %rd446;
811
+ shl.b64 %rd448, %rd436, 1;
812
+ add.s64 %rd351, %rd447, %rd448;
813
+ shl.b64 %rd449, %rd429, 1;
814
+ add.s64 %rd450, %rd140, %rd449;
815
+ shl.b64 %rd451, %rd437, 1;
816
+ add.s64 %rd354, %rd450, %rd451;
817
+ shl.b64 %rd452, %rd430, 1;
818
+ add.s64 %rd453, %rd140, %rd452;
819
+ shl.b64 %rd454, %rd438, 1;
820
+ add.s64 %rd357, %rd453, %rd454;
821
+ shl.b64 %rd455, %rd431, 1;
822
+ add.s64 %rd456, %rd140, %rd455;
823
+ shl.b64 %rd457, %rd439, 1;
824
+ add.s64 %rd360, %rd456, %rd457;
825
+ shl.b64 %rd458, %rd432, 1;
826
+ add.s64 %rd459, %rd140, %rd458;
827
+ shl.b64 %rd460, %rd440, 1;
828
+ add.s64 %rd363, %rd459, %rd460;
829
+ shl.b64 %rd461, %rd433, 1;
830
+ add.s64 %rd462, %rd140, %rd461;
831
+ shl.b64 %rd463, %rd441, 1;
832
+ add.s64 %rd366, %rd462, %rd463;
833
+ shl.b64 %rd464, %rd434, 1;
834
+ add.s64 %rd465, %rd140, %rd464;
835
+ shl.b64 %rd466, %rd442, 1;
836
+ add.s64 %rd369, %rd465, %rd466;
837
+ .loc 1 37 65 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:65
838
+ // begin inline asm
839
+ mov.u64 %rd347, 0x0;
840
+ createpolicy.fractional.L2::evict_last.b64 %rd347, 1.0;
841
+ // end inline asm
842
+ // begin inline asm
843
+ mov.u16 %rs91, %rs64;
844
+ @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs91 }, [ %rd348 + 0 ], %rd347;
845
+ // end inline asm
846
+ // begin inline asm
847
+ mov.u64 %rd350, 0x0;
848
+ createpolicy.fractional.L2::evict_last.b64 %rd350, 1.0;
849
+ // end inline asm
850
+ // begin inline asm
851
+ mov.u16 %rs93, %rs64;
852
+ @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs93 }, [ %rd351 + 0 ], %rd350;
853
+ // end inline asm
854
+ // begin inline asm
855
+ mov.u64 %rd353, 0x0;
856
+ createpolicy.fractional.L2::evict_last.b64 %rd353, 1.0;
857
+ // end inline asm
858
+ // begin inline asm
859
+ mov.u16 %rs95, %rs64;
860
+ @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs95 }, [ %rd354 + 0 ], %rd353;
861
+ // end inline asm
862
+ // begin inline asm
863
+ mov.u64 %rd356, 0x0;
864
+ createpolicy.fractional.L2::evict_last.b64 %rd356, 1.0;
865
+ // end inline asm
866
+ // begin inline asm
867
+ mov.u16 %rs97, %rs64;
868
+ @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs97 }, [ %rd357 + 0 ], %rd356;
869
+ // end inline asm
870
+ // begin inline asm
871
+ mov.u64 %rd359, 0x0;
872
+ createpolicy.fractional.L2::evict_last.b64 %rd359, 1.0;
873
+ // end inline asm
874
+ // begin inline asm
875
+ mov.u16 %rs99, %rs64;
876
+ @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs99 }, [ %rd360 + 0 ], %rd359;
877
+ // end inline asm
878
+ // begin inline asm
879
+ mov.u64 %rd362, 0x0;
880
+ createpolicy.fractional.L2::evict_last.b64 %rd362, 1.0;
881
+ // end inline asm
882
+ // begin inline asm
883
+ mov.u16 %rs101, %rs64;
884
+ @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs101 }, [ %rd363 + 0 ], %rd362;
885
+ // end inline asm
886
+ // begin inline asm
887
+ mov.u64 %rd365, 0x0;
888
+ createpolicy.fractional.L2::evict_last.b64 %rd365, 1.0;
889
+ // end inline asm
890
+ // begin inline asm
891
+ mov.u16 %rs103, %rs64;
892
+ @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs103 }, [ %rd366 + 0 ], %rd365;
893
+ // end inline asm
894
+ // begin inline asm
895
+ mov.u64 %rd368, 0x0;
896
+ createpolicy.fractional.L2::evict_last.b64 %rd368, 1.0;
897
+ // end inline asm
898
+ // begin inline asm
899
+ mov.u16 %rs105, %rs64;
900
+ @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs105 }, [ %rd369 + 0 ], %rd368;
901
+ // end inline asm
902
+ .loc 1 44 19 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:44:19
903
+ setp.lt.s64 %p146, %rd9, %rd89;
904
+ setp.lt.s64 %p147, %rd10, %rd89;
905
+ setp.lt.s64 %p148, %rd11, %rd89;
906
+ setp.lt.s64 %p149, %rd12, %rd89;
907
+ setp.lt.s64 %p150, %rd13, %rd89;
908
+ setp.lt.s64 %p151, %rd14, %rd89;
909
+ setp.lt.s64 %p152, %rd15, %rd89;
910
+ setp.lt.s64 %p153, %rd16, %rd89;
911
+ .loc 1 45 37 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:37
912
+ add.s64 %rd467, %rd143, %rd1;
913
+ add.s64 %rd468, %rd143, %rd2;
914
+ add.s64 %rd469, %rd143, %rd3;
915
+ add.s64 %rd470, %rd143, %rd4;
916
+ add.s64 %rd471, %rd143, %rd5;
917
+ add.s64 %rd472, %rd143, %rd6;
918
+ add.s64 %rd473, %rd143, %rd7;
919
+ add.s64 %rd474, %rd143, %rd8;
920
+ .loc 1 45 42 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:42
921
+ sub.s64 %rd475, %rd467, %rd89;
922
+ sub.s64 %rd476, %rd468, %rd89;
923
+ sub.s64 %rd477, %rd469, %rd89;
924
+ sub.s64 %rd478, %rd470, %rd89;
925
+ sub.s64 %rd479, %rd471, %rd89;
926
+ sub.s64 %rd480, %rd472, %rd89;
927
+ sub.s64 %rd481, %rd473, %rd89;
928
+ sub.s64 %rd482, %rd474, %rd89;
929
+ .loc 1 45 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:31
930
+ shl.b64 %rd483, %rd475, 1;
931
+ add.s64 %rd372, %rd138, %rd483;
932
+ shl.b64 %rd484, %rd476, 1;
933
+ add.s64 %rd375, %rd138, %rd484;
934
+ shl.b64 %rd485, %rd477, 1;
935
+ add.s64 %rd378, %rd138, %rd485;
936
+ shl.b64 %rd486, %rd478, 1;
937
+ add.s64 %rd381, %rd138, %rd486;
938
+ shl.b64 %rd487, %rd479, 1;
939
+ add.s64 %rd384, %rd138, %rd487;
940
+ shl.b64 %rd488, %rd480, 1;
941
+ add.s64 %rd387, %rd138, %rd488;
942
+ shl.b64 %rd489, %rd481, 1;
943
+ add.s64 %rd390, %rd138, %rd489;
944
+ shl.b64 %rd490, %rd482, 1;
945
+ add.s64 %rd393, %rd138, %rd490;
946
+ .loc 1 45 60 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:60
947
+ // begin inline asm
948
+ mov.u64 %rd371, 0x0;
949
+ createpolicy.fractional.L2::evict_last.b64 %rd371, 1.0;
950
+ // end inline asm
951
+ .loc 1 45 68 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:68
952
+ and.pred %p24, %p8, %p153;
953
+ and.pred %p23, %p7, %p152;
954
+ and.pred %p22, %p6, %p151;
955
+ and.pred %p21, %p5, %p150;
956
+ and.pred %p20, %p4, %p149;
957
+ and.pred %p19, %p3, %p148;
958
+ and.pred %p18, %p2, %p147;
959
+ and.pred %p17, %p1, %p146;
960
+ .loc 1 45 60 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:60
961
+ // begin inline asm
962
+ mov.u16 %rs107, %rs64;
963
+ @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs107 }, [ %rd372 + 0 ], %rd371;
964
+ // end inline asm
965
+ // begin inline asm
966
+ mov.u64 %rd374, 0x0;
967
+ createpolicy.fractional.L2::evict_last.b64 %rd374, 1.0;
968
+ // end inline asm
969
+ // begin inline asm
970
+ mov.u16 %rs109, %rs64;
971
+ @%p18 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs109 }, [ %rd375 + 0 ], %rd374;
972
+ // end inline asm
973
+ // begin inline asm
974
+ mov.u64 %rd377, 0x0;
975
+ createpolicy.fractional.L2::evict_last.b64 %rd377, 1.0;
976
+ // end inline asm
977
+ // begin inline asm
978
+ mov.u16 %rs111, %rs64;
979
+ @%p19 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs111 }, [ %rd378 + 0 ], %rd377;
980
+ // end inline asm
981
+ // begin inline asm
982
+ mov.u64 %rd380, 0x0;
983
+ createpolicy.fractional.L2::evict_last.b64 %rd380, 1.0;
984
+ // end inline asm
985
+ // begin inline asm
986
+ mov.u16 %rs113, %rs64;
987
+ @%p20 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs113 }, [ %rd381 + 0 ], %rd380;
988
+ // end inline asm
989
+ // begin inline asm
990
+ mov.u64 %rd383, 0x0;
991
+ createpolicy.fractional.L2::evict_last.b64 %rd383, 1.0;
992
+ // end inline asm
993
+ // begin inline asm
994
+ mov.u16 %rs115, %rs64;
995
+ @%p21 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs115 }, [ %rd384 + 0 ], %rd383;
996
+ // end inline asm
997
+ // begin inline asm
998
+ mov.u64 %rd386, 0x0;
999
+ createpolicy.fractional.L2::evict_last.b64 %rd386, 1.0;
1000
+ // end inline asm
1001
+ // begin inline asm
1002
+ mov.u16 %rs117, %rs64;
1003
+ @%p22 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs117 }, [ %rd387 + 0 ], %rd386;
1004
+ // end inline asm
1005
+ // begin inline asm
1006
+ mov.u64 %rd389, 0x0;
1007
+ createpolicy.fractional.L2::evict_last.b64 %rd389, 1.0;
1008
+ // end inline asm
1009
+ // begin inline asm
1010
+ mov.u16 %rs119, %rs64;
1011
+ @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs119 }, [ %rd390 + 0 ], %rd389;
1012
+ // end inline asm
1013
+ // begin inline asm
1014
+ mov.u64 %rd392, 0x0;
1015
+ createpolicy.fractional.L2::evict_last.b64 %rd392, 1.0;
1016
+ // end inline asm
1017
+ // begin inline asm
1018
+ mov.u16 %rs121, %rs64;
1019
+ @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs121 }, [ %rd393 + 0 ], %rd392;
1020
+ // end inline asm
1021
+ .loc 1 46 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:46:36
1022
+ // begin inline asm
1023
+ mov.u64 %rd395, 0x0;
1024
+ createpolicy.fractional.L2::evict_last.b64 %rd395, 1.0;
1025
+ // end inline asm
1026
+ // begin inline asm
1027
+ mov.u64 %rd396, 0x0;
1028
+ @%p17 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd396 }, [ %rd261 + 0 ], %rd395;
1029
+ // end inline asm
1030
+ // begin inline asm
1031
+ mov.u64 %rd399, 0x0;
1032
+ createpolicy.fractional.L2::evict_last.b64 %rd399, 1.0;
1033
+ // end inline asm
1034
+ // begin inline asm
1035
+ mov.u64 %rd400, 0x0;
1036
+ @%p18 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd400 }, [ %rd265 + 0 ], %rd399;
1037
+ // end inline asm
1038
+ // begin inline asm
1039
+ mov.u64 %rd403, 0x0;
1040
+ createpolicy.fractional.L2::evict_last.b64 %rd403, 1.0;
1041
+ // end inline asm
1042
+ // begin inline asm
1043
+ mov.u64 %rd404, 0x0;
1044
+ @%p19 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd404 }, [ %rd269 + 0 ], %rd403;
1045
+ // end inline asm
1046
+ // begin inline asm
1047
+ mov.u64 %rd407, 0x0;
1048
+ createpolicy.fractional.L2::evict_last.b64 %rd407, 1.0;
1049
+ // end inline asm
1050
+ // begin inline asm
1051
+ mov.u64 %rd408, 0x0;
1052
+ @%p20 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd408 }, [ %rd273 + 0 ], %rd407;
1053
+ // end inline asm
1054
+ // begin inline asm
1055
+ mov.u64 %rd411, 0x0;
1056
+ createpolicy.fractional.L2::evict_last.b64 %rd411, 1.0;
1057
+ // end inline asm
1058
+ // begin inline asm
1059
+ mov.u64 %rd412, 0x0;
1060
+ @%p21 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd412 }, [ %rd277 + 0 ], %rd411;
1061
+ // end inline asm
1062
+ // begin inline asm
1063
+ mov.u64 %rd415, 0x0;
1064
+ createpolicy.fractional.L2::evict_last.b64 %rd415, 1.0;
1065
+ // end inline asm
1066
+ // begin inline asm
1067
+ mov.u64 %rd416, 0x0;
1068
+ @%p22 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd416 }, [ %rd281 + 0 ], %rd415;
1069
+ // end inline asm
1070
+ // begin inline asm
1071
+ mov.u64 %rd419, 0x0;
1072
+ createpolicy.fractional.L2::evict_last.b64 %rd419, 1.0;
1073
+ // end inline asm
1074
+ // begin inline asm
1075
+ mov.u64 %rd420, 0x0;
1076
+ @%p23 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd420 }, [ %rd285 + 0 ], %rd419;
1077
+ // end inline asm
1078
+ // begin inline asm
1079
+ mov.u64 %rd423, 0x0;
1080
+ createpolicy.fractional.L2::evict_last.b64 %rd423, 1.0;
1081
+ // end inline asm
1082
+ // begin inline asm
1083
+ mov.u64 %rd424, 0x0;
1084
+ @%p24 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd424 }, [ %rd289 + 0 ], %rd423;
1085
+ // end inline asm
1086
+ .loc 1 50 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:50:35
1087
+ shr.s64 %rd491, %rd404, 63;
1088
+ and.b64 %rd492, %rd491, %rd145;
1089
+ shr.s64 %rd493, %rd408, 63;
1090
+ and.b64 %rd494, %rd493, %rd145;
1091
+ shr.s64 %rd495, %rd396, 63;
1092
+ and.b64 %rd496, %rd495, %rd145;
1093
+ shr.s64 %rd497, %rd400, 63;
1094
+ and.b64 %rd498, %rd497, %rd145;
1095
+ shr.s64 %rd499, %rd420, 63;
1096
+ and.b64 %rd500, %rd499, %rd145;
1097
+ shr.s64 %rd501, %rd424, 63;
1098
+ and.b64 %rd502, %rd501, %rd145;
1099
+ shr.s64 %rd503, %rd412, 63;
1100
+ and.b64 %rd504, %rd503, %rd145;
1101
+ shr.s64 %rd505, %rd416, 63;
1102
+ and.b64 %rd506, %rd505, %rd145;
1103
+ add.s64 %rd127, %rd506, %rd416;
1104
+ add.s64 %rd126, %rd504, %rd412;
1105
+ add.s64 %rd129, %rd502, %rd424;
1106
+ add.s64 %rd128, %rd500, %rd420;
1107
+ add.s64 %rd123, %rd498, %rd400;
1108
+ add.s64 %rd122, %rd496, %rd396;
1109
+ add.s64 %rd125, %rd494, %rd408;
1110
+ add.s64 %rd124, %rd492, %rd404;
1111
+ .loc 1 51 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:28
1112
+ setp.lt.s64 %p154, %rd124, 0;
1113
+ setp.lt.s64 %p155, %rd125, 0;
1114
+ setp.lt.s64 %p156, %rd122, 0;
1115
+ setp.lt.s64 %p157, %rd123, 0;
1116
+ setp.lt.s64 %p158, %rd128, 0;
1117
+ setp.lt.s64 %p159, %rd129, 0;
1118
+ setp.lt.s64 %p160, %rd126, 0;
1119
+ setp.lt.s64 %p161, %rd127, 0;
1120
+ .loc 1 51 100 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:100
1121
+ setp.ge.s64 %p162, %rd124, %rd145;
1122
+ setp.ge.s64 %p163, %rd125, %rd145;
1123
+ setp.ge.s64 %p164, %rd122, %rd145;
1124
+ setp.ge.s64 %p165, %rd123, %rd145;
1125
+ setp.ge.s64 %p166, %rd128, %rd145;
1126
+ setp.ge.s64 %p167, %rd129, %rd145;
1127
+ setp.ge.s64 %p168, %rd126, %rd145;
1128
+ setp.ge.s64 %p169, %rd127, %rd145;
1129
+ .loc 1 51 65 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:65
1130
+ or.pred %p170, %p161, %p169;
1131
+ or.pred %p171, %p160, %p168;
1132
+ or.pred %p172, %p159, %p167;
1133
+ or.pred %p173, %p158, %p166;
1134
+ or.pred %p174, %p157, %p165;
1135
+ or.pred %p175, %p156, %p164;
1136
+ or.pred %p176, %p155, %p163;
1137
+ or.pred %p177, %p154, %p162;
1138
+ .loc 1 51 108 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:108
1139
+ and.pred %p178, %p19, %p177;
1140
+ selp.b16 %rs123, 1, 0, %p178;
1141
+ shl.b16 %rs124, %rs123, 2;
1142
+ and.pred %p179, %p20, %p176;
1143
+ selp.b16 %rs125, -1, 0, %p179;
1144
+ shl.b16 %rs126, %rs125, 3;
1145
+ or.b16 %rs127, %rs126, %rs124;
1146
+ and.pred %p180, %p17, %p175;
1147
+ selp.b16 %rs128, 1, 0, %p180;
1148
+ and.pred %p181, %p18, %p174;
1149
+ selp.b16 %rs129, -1, 0, %p181;
1150
+ shl.b16 %rs130, %rs129, 1;
1151
+ or.b16 %rs131, %rs128, %rs130;
1152
+ and.b16 %rs132, %rs131, 3;
1153
+ or.b16 %rs133, %rs132, %rs127;
1154
+ and.b16 %rs134, %rs133, 15;
1155
+ and.pred %p182, %p23, %p173;
1156
+ selp.b16 %rs135, 1, 0, %p182;
1157
+ shl.b16 %rs136, %rs135, 2;
1158
+ and.pred %p183, %p24, %p172;
1159
+ selp.b16 %rs137, -1, 0, %p183;
1160
+ shl.b16 %rs138, %rs137, 3;
1161
+ or.b16 %rs139, %rs138, %rs136;
1162
+ and.pred %p184, %p21, %p171;
1163
+ selp.b16 %rs140, 1, 0, %p184;
1164
+ and.pred %p185, %p22, %p170;
1165
+ selp.b16 %rs141, -1, 0, %p185;
1166
+ shl.b16 %rs142, %rs141, 1;
1167
+ or.b16 %rs143, %rs140, %rs142;
1168
+ and.b16 %rs144, %rs143, 3;
1169
+ or.b16 %rs145, %rs144, %rs139;
1170
+ shl.b16 %rs146, %rs145, 4;
1171
+ or.b16 %rs147, %rs134, %rs146;
1172
+ .loc 1 51 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:126
1173
+ and.b16 %rs148, %rs147, 255;
1174
+ setp.eq.b16 %p186, %rs148, 0;
1175
+ @%p186 bra $L__BB0_52;
1176
+ bra.uni $L__BB0_51;
1177
+ $L__BB0_52:
1178
+ .loc 1 0 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:126
1179
+ ld.param.b64 %rd146, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8];
1180
+ .loc 1 51 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:126
1181
+ bar.sync 0;
1182
+ .loc 1 52 37 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:37
1183
+ sub.s64 %rd537, %rd143, %rd89;
1184
+ .loc 1 52 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:64
1185
+ mul.lo.s64 %rd538, %rd122, %rd143;
1186
+ mul.lo.s64 %rd539, %rd123, %rd143;
1187
+ mul.lo.s64 %rd540, %rd124, %rd143;
1188
+ mul.lo.s64 %rd541, %rd125, %rd143;
1189
+ mul.lo.s64 %rd542, %rd126, %rd143;
1190
+ mul.lo.s64 %rd543, %rd127, %rd143;
1191
+ mul.lo.s64 %rd544, %rd128, %rd143;
1192
+ mul.lo.s64 %rd545, %rd129, %rd143;
1193
+ .loc 1 52 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:31
1194
+ shl.b64 %rd546, %rd537, 1;
1195
+ add.s64 %rd547, %rd140, %rd546;
1196
+ shl.b64 %rd548, %rd9, 1;
1197
+ add.s64 %rd549, %rd547, %rd548;
1198
+ shl.b64 %rd550, %rd538, 1;
1199
+ add.s64 %rd514, %rd549, %rd550;
1200
+ shl.b64 %rd551, %rd10, 1;
1201
+ add.s64 %rd552, %rd547, %rd551;
1202
+ shl.b64 %rd553, %rd539, 1;
1203
+ add.s64 %rd517, %rd552, %rd553;
1204
+ shl.b64 %rd554, %rd11, 1;
1205
+ add.s64 %rd555, %rd547, %rd554;
1206
+ shl.b64 %rd556, %rd540, 1;
1207
+ add.s64 %rd520, %rd555, %rd556;
1208
+ shl.b64 %rd557, %rd12, 1;
1209
+ add.s64 %rd558, %rd547, %rd557;
1210
+ shl.b64 %rd559, %rd541, 1;
1211
+ add.s64 %rd523, %rd558, %rd559;
1212
+ shl.b64 %rd560, %rd13, 1;
1213
+ add.s64 %rd561, %rd547, %rd560;
1214
+ shl.b64 %rd562, %rd542, 1;
1215
+ add.s64 %rd526, %rd561, %rd562;
1216
+ shl.b64 %rd563, %rd14, 1;
1217
+ add.s64 %rd564, %rd547, %rd563;
1218
+ shl.b64 %rd565, %rd543, 1;
1219
+ add.s64 %rd529, %rd564, %rd565;
1220
+ shl.b64 %rd566, %rd15, 1;
1221
+ add.s64 %rd567, %rd547, %rd566;
1222
+ shl.b64 %rd568, %rd544, 1;
1223
+ add.s64 %rd532, %rd567, %rd568;
1224
+ shl.b64 %rd569, %rd16, 1;
1225
+ add.s64 %rd570, %rd547, %rd569;
1226
+ shl.b64 %rd571, %rd545, 1;
1227
+ add.s64 %rd535, %rd570, %rd571;
1228
+ .loc 1 52 72 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:72
1229
+ // begin inline asm
1230
+ mov.u64 %rd513, 0x0;
1231
+ createpolicy.fractional.L2::evict_last.b64 %rd513, 1.0;
1232
+ // end inline asm
1233
+ mov.b16 %rs150, 0;
1234
+ // begin inline asm
1235
+ mov.u16 %rs149, %rs150;
1236
+ @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs149 }, [ %rd514 + 0 ], %rd513;
1237
+ // end inline asm
1238
+ // begin inline asm
1239
+ mov.u64 %rd516, 0x0;
1240
+ createpolicy.fractional.L2::evict_last.b64 %rd516, 1.0;
1241
+ // end inline asm
1242
+ // begin inline asm
1243
+ mov.u16 %rs151, %rs150;
1244
+ @%p18 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs151 }, [ %rd517 + 0 ], %rd516;
1245
+ // end inline asm
1246
+ // begin inline asm
1247
+ mov.u64 %rd519, 0x0;
1248
+ createpolicy.fractional.L2::evict_last.b64 %rd519, 1.0;
1249
+ // end inline asm
1250
+ // begin inline asm
1251
+ mov.u16 %rs153, %rs150;
1252
+ @%p19 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs153 }, [ %rd520 + 0 ], %rd519;
1253
+ // end inline asm
1254
+ // begin inline asm
1255
+ mov.u64 %rd522, 0x0;
1256
+ createpolicy.fractional.L2::evict_last.b64 %rd522, 1.0;
1257
+ // end inline asm
1258
+ // begin inline asm
1259
+ mov.u16 %rs155, %rs150;
1260
+ @%p20 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs155 }, [ %rd523 + 0 ], %rd522;
1261
+ // end inline asm
1262
+ // begin inline asm
1263
+ mov.u64 %rd525, 0x0;
1264
+ createpolicy.fractional.L2::evict_last.b64 %rd525, 1.0;
1265
+ // end inline asm
1266
+ // begin inline asm
1267
+ mov.u16 %rs157, %rs150;
1268
+ @%p21 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs157 }, [ %rd526 + 0 ], %rd525;
1269
+ // end inline asm
1270
+ // begin inline asm
1271
+ mov.u64 %rd528, 0x0;
1272
+ createpolicy.fractional.L2::evict_last.b64 %rd528, 1.0;
1273
+ // end inline asm
1274
+ // begin inline asm
1275
+ mov.u16 %rs159, %rs150;
1276
+ @%p22 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs159 }, [ %rd529 + 0 ], %rd528;
1277
+ // end inline asm
1278
+ // begin inline asm
1279
+ mov.u64 %rd531, 0x0;
1280
+ createpolicy.fractional.L2::evict_last.b64 %rd531, 1.0;
1281
+ // end inline asm
1282
+ // begin inline asm
1283
+ mov.u16 %rs161, %rs150;
1284
+ @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs161 }, [ %rd532 + 0 ], %rd531;
1285
+ // end inline asm
1286
+ // begin inline asm
1287
+ mov.u64 %rd534, 0x0;
1288
+ createpolicy.fractional.L2::evict_last.b64 %rd534, 1.0;
1289
+ // end inline asm
1290
+ // begin inline asm
1291
+ mov.u16 %rs163, %rs150;
1292
+ @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs163 }, [ %rd535 + 0 ], %rd534;
1293
+ // end inline asm
1294
+ .loc 1 61 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:61:35
1295
+ shr.s64 %rd572, %rd212, 63;
1296
+ and.b64 %rd573, %rd572, %rd146;
1297
+ shr.s64 %rd574, %rd216, 63;
1298
+ and.b64 %rd575, %rd574, %rd146;
1299
+ shr.s64 %rd576, %rd204, 63;
1300
+ and.b64 %rd577, %rd576, %rd146;
1301
+ shr.s64 %rd578, %rd208, 63;
1302
+ and.b64 %rd579, %rd578, %rd146;
1303
+ shr.s64 %rd580, %rd228, 63;
1304
+ and.b64 %rd581, %rd580, %rd146;
1305
+ shr.s64 %rd582, %rd232, 63;
1306
+ and.b64 %rd583, %rd582, %rd146;
1307
+ shr.s64 %rd584, %rd220, 63;
1308
+ and.b64 %rd585, %rd584, %rd146;
1309
+ shr.s64 %rd586, %rd224, 63;
1310
+ and.b64 %rd587, %rd586, %rd146;
1311
+ add.s64 %rd135, %rd587, %rd224;
1312
+ add.s64 %rd134, %rd585, %rd220;
1313
+ add.s64 %rd137, %rd583, %rd232;
1314
+ add.s64 %rd136, %rd581, %rd228;
1315
+ add.s64 %rd131, %rd579, %rd208;
1316
+ add.s64 %rd130, %rd577, %rd204;
1317
+ add.s64 %rd133, %rd575, %rd216;
1318
+ add.s64 %rd132, %rd573, %rd212;
1319
+ .loc 1 62 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:28
1320
+ setp.lt.s64 %p203, %rd132, 0;
1321
+ setp.lt.s64 %p204, %rd133, 0;
1322
+ setp.lt.s64 %p205, %rd130, 0;
1323
+ setp.lt.s64 %p206, %rd131, 0;
1324
+ setp.lt.s64 %p207, %rd136, 0;
1325
+ setp.lt.s64 %p208, %rd137, 0;
1326
+ setp.lt.s64 %p209, %rd134, 0;
1327
+ setp.lt.s64 %p210, %rd135, 0;
1328
+ .loc 1 62 46 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:46
1329
+ setp.ge.s64 %p211, %rd132, %rd146;
1330
+ setp.ge.s64 %p212, %rd133, %rd146;
1331
+ setp.ge.s64 %p213, %rd130, %rd146;
1332
+ setp.ge.s64 %p214, %rd131, %rd146;
1333
+ setp.ge.s64 %p215, %rd136, %rd146;
1334
+ setp.ge.s64 %p216, %rd137, %rd146;
1335
+ setp.ge.s64 %p217, %rd134, %rd146;
1336
+ setp.ge.s64 %p218, %rd135, %rd146;
1337
+ .loc 1 62 38 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:38
1338
+ or.pred %p219, %p210, %p218;
1339
+ or.pred %p220, %p209, %p217;
1340
+ or.pred %p221, %p208, %p216;
1341
+ or.pred %p222, %p207, %p215;
1342
+ or.pred %p223, %p206, %p214;
1343
+ or.pred %p224, %p205, %p213;
1344
+ or.pred %p225, %p204, %p212;
1345
+ or.pred %p226, %p203, %p211;
1346
+ .loc 1 62 54 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:54
1347
+ and.pred %p227, %p3, %p226;
1348
+ selp.b16 %rs165, 1, 0, %p227;
1349
+ shl.b16 %rs166, %rs165, 2;
1350
+ and.pred %p228, %p4, %p225;
1351
+ selp.b16 %rs167, -1, 0, %p228;
1352
+ shl.b16 %rs168, %rs167, 3;
1353
+ or.b16 %rs169, %rs168, %rs166;
1354
+ and.pred %p229, %p1, %p224;
1355
+ selp.b16 %rs170, 1, 0, %p229;
1356
+ and.pred %p230, %p2, %p223;
1357
+ selp.b16 %rs171, -1, 0, %p230;
1358
+ shl.b16 %rs172, %rs171, 1;
1359
+ or.b16 %rs173, %rs170, %rs172;
1360
+ and.b16 %rs174, %rs173, 3;
1361
+ or.b16 %rs175, %rs174, %rs169;
1362
+ and.b16 %rs176, %rs175, 15;
1363
+ and.pred %p231, %p7, %p222;
1364
+ selp.b16 %rs177, 1, 0, %p231;
1365
+ shl.b16 %rs178, %rs177, 2;
1366
+ and.pred %p232, %p8, %p221;
1367
+ selp.b16 %rs179, -1, 0, %p232;
1368
+ shl.b16 %rs180, %rs179, 3;
1369
+ or.b16 %rs181, %rs180, %rs178;
1370
+ and.pred %p233, %p5, %p220;
1371
+ selp.b16 %rs182, 1, 0, %p233;
1372
+ and.pred %p234, %p6, %p219;
1373
+ selp.b16 %rs183, -1, 0, %p234;
1374
+ shl.b16 %rs184, %rs183, 1;
1375
+ or.b16 %rs185, %rs182, %rs184;
1376
+ and.b16 %rs186, %rs185, 3;
1377
+ or.b16 %rs187, %rs186, %rs181;
1378
+ shl.b16 %rs188, %rs187, 4;
1379
+ or.b16 %rs189, %rs176, %rs188;
1380
+ .loc 1 62 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:64
1381
+ and.b16 %rs190, %rs189, 255;
1382
+ setp.eq.b16 %p235, %rs190, 0;
1383
+ @%p235 bra $L__BB0_54;
1384
+ bra.uni $L__BB0_53;
1385
+ $L__BB0_54:
1386
+ .loc 1 0 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:64
1387
+ ld.param.b64 %rd142, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4];
1388
+ ld.param.b64 %rd141, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3];
1389
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1390
+ cvt.f32.bf16 %r70, %rs63;
1391
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1392
+ cvt.f32.bf16 %r71, %rs105;
1393
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1394
+ neg.f32 %r72, %r70;
1395
+ fma.rn.f32 %r73, %r72, %r71, 0f00000000;
1396
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1397
+ selp.f32 %r74, %r73, 0f00000000, %p80;
1398
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1399
+ cvt.f32.bf16 %r75, %rs121;
1400
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1401
+ cvt.f32.bf16 %r76, %rs163;
1402
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1403
+ mul.f32 %r77, %r75, %r76;
1404
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1405
+ selp.f32 %r78, %r77, 0f00000000, %p153;
1406
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1407
+ add.f32 %r79, %r74, %r78;
1408
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1409
+ cvt.f32.bf16 %r80, %rs61;
1410
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1411
+ cvt.f32.bf16 %r81, %rs103;
1412
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1413
+ neg.f32 %r82, %r80;
1414
+ fma.rn.f32 %r83, %r82, %r81, 0f00000000;
1415
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1416
+ selp.f32 %r84, %r83, 0f00000000, %p79;
1417
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1418
+ cvt.f32.bf16 %r85, %rs119;
1419
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1420
+ cvt.f32.bf16 %r86, %rs161;
1421
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1422
+ mul.f32 %r87, %r85, %r86;
1423
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1424
+ selp.f32 %r88, %r87, 0f00000000, %p152;
1425
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1426
+ add.f32 %r89, %r84, %r88;
1427
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1428
+ cvt.f32.bf16 %r90, %rs59;
1429
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1430
+ cvt.f32.bf16 %r91, %rs101;
1431
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1432
+ neg.f32 %r92, %r90;
1433
+ fma.rn.f32 %r93, %r92, %r91, 0f00000000;
1434
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1435
+ selp.f32 %r94, %r93, 0f00000000, %p78;
1436
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1437
+ cvt.f32.bf16 %r95, %rs117;
1438
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1439
+ cvt.f32.bf16 %r96, %rs159;
1440
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1441
+ mul.f32 %r97, %r95, %r96;
1442
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1443
+ selp.f32 %r98, %r97, 0f00000000, %p151;
1444
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1445
+ add.f32 %r99, %r94, %r98;
1446
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1447
+ cvt.f32.bf16 %r100, %rs57;
1448
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1449
+ cvt.f32.bf16 %r101, %rs99;
1450
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1451
+ neg.f32 %r102, %r100;
1452
+ fma.rn.f32 %r103, %r102, %r101, 0f00000000;
1453
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1454
+ selp.f32 %r104, %r103, 0f00000000, %p77;
1455
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1456
+ cvt.f32.bf16 %r105, %rs115;
1457
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1458
+ cvt.f32.bf16 %r106, %rs157;
1459
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1460
+ mul.f32 %r107, %r105, %r106;
1461
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1462
+ selp.f32 %r108, %r107, 0f00000000, %p150;
1463
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1464
+ add.f32 %r109, %r104, %r108;
1465
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1466
+ cvt.f32.bf16 %r110, %rs55;
1467
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1468
+ cvt.f32.bf16 %r111, %rs97;
1469
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1470
+ neg.f32 %r112, %r110;
1471
+ fma.rn.f32 %r113, %r112, %r111, 0f00000000;
1472
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1473
+ selp.f32 %r114, %r113, 0f00000000, %p76;
1474
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1475
+ cvt.f32.bf16 %r115, %rs113;
1476
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1477
+ cvt.f32.bf16 %r116, %rs155;
1478
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1479
+ mul.f32 %r117, %r115, %r116;
1480
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1481
+ selp.f32 %r118, %r117, 0f00000000, %p149;
1482
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1483
+ add.f32 %r119, %r114, %r118;
1484
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1485
+ cvt.f32.bf16 %r120, %rs53;
1486
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1487
+ cvt.f32.bf16 %r121, %rs95;
1488
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1489
+ neg.f32 %r122, %r120;
1490
+ fma.rn.f32 %r123, %r122, %r121, 0f00000000;
1491
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1492
+ selp.f32 %r124, %r123, 0f00000000, %p75;
1493
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1494
+ cvt.f32.bf16 %r125, %rs111;
1495
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1496
+ cvt.f32.bf16 %r126, %rs153;
1497
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1498
+ mul.f32 %r127, %r125, %r126;
1499
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1500
+ selp.f32 %r128, %r127, 0f00000000, %p148;
1501
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1502
+ add.f32 %r129, %r124, %r128;
1503
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1504
+ cvt.f32.bf16 %r130, %rs51;
1505
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1506
+ cvt.f32.bf16 %r131, %rs93;
1507
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1508
+ neg.f32 %r132, %r130;
1509
+ fma.rn.f32 %r133, %r132, %r131, 0f00000000;
1510
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1511
+ selp.f32 %r134, %r133, 0f00000000, %p74;
1512
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1513
+ cvt.f32.bf16 %r135, %rs109;
1514
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1515
+ cvt.f32.bf16 %r136, %rs151;
1516
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1517
+ mul.f32 %r137, %r135, %r136;
1518
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1519
+ selp.f32 %r138, %r137, 0f00000000, %p147;
1520
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1521
+ add.f32 %r139, %r134, %r138;
1522
+ .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111
1523
+ cvt.f32.bf16 %r140, %rs49;
1524
+ .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123
1525
+ cvt.f32.bf16 %r141, %rs91;
1526
+ .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13
1527
+ neg.f32 %r142, %r140;
1528
+ fma.rn.f32 %r143, %r142, %r141, 0f00000000;
1529
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1530
+ selp.f32 %r144, %r143, 0f00000000, %p73;
1531
+ .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119
1532
+ cvt.f32.bf16 %r145, %rs107;
1533
+ .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131
1534
+ cvt.f32.bf16 %r146, %rs149;
1535
+ .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20
1536
+ mul.f32 %r147, %r145, %r146;
1537
+ .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0
1538
+ selp.f32 %r148, %r147, 0f00000000, %p146;
1539
+ .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20
1540
+ add.f32 %r149, %r144, %r148;
1541
+ .loc 1 25 76 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:25:76
1542
+ cvt.f32.bf16 %r150, %rs48;
1543
+ cvt.f32.bf16 %r151, %rs47;
1544
+ cvt.f32.bf16 %r152, %rs46;
1545
+ cvt.f32.bf16 %r153, %rs45;
1546
+ cvt.f32.bf16 %r154, %rs44;
1547
+ cvt.f32.bf16 %r155, %rs43;
1548
+ cvt.f32.bf16 %r156, %rs42;
1549
+ cvt.f32.bf16 %r157, %rs41;
1550
+ .loc 1 62 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:64
1551
+ bar.sync 0;
1552
+ .loc 1 63 40 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:40
1553
+ mul.lo.s64 %rd626, %rd130, %rd143;
1554
+ mul.lo.s64 %rd627, %rd131, %rd143;
1555
+ mul.lo.s64 %rd628, %rd132, %rd143;
1556
+ mul.lo.s64 %rd629, %rd133, %rd143;
1557
+ mul.lo.s64 %rd630, %rd134, %rd143;
1558
+ mul.lo.s64 %rd631, %rd135, %rd143;
1559
+ mul.lo.s64 %rd632, %rd136, %rd143;
1560
+ mul.lo.s64 %rd633, %rd137, %rd143;
1561
+ .loc 1 63 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:31
1562
+ add.s64 %rd635, %rd141, %rd548;
1563
+ shl.b64 %rd636, %rd626, 1;
1564
+ add.s64 %rd595, %rd635, %rd636;
1565
+ add.s64 %rd638, %rd141, %rd551;
1566
+ shl.b64 %rd639, %rd627, 1;
1567
+ add.s64 %rd598, %rd638, %rd639;
1568
+ add.s64 %rd641, %rd141, %rd554;
1569
+ shl.b64 %rd642, %rd628, 1;
1570
+ add.s64 %rd601, %rd641, %rd642;
1571
+ add.s64 %rd644, %rd141, %rd557;
1572
+ shl.b64 %rd645, %rd629, 1;
1573
+ add.s64 %rd604, %rd644, %rd645;
1574
+ add.s64 %rd647, %rd141, %rd560;
1575
+ shl.b64 %rd648, %rd630, 1;
1576
+ add.s64 %rd607, %rd647, %rd648;
1577
+ add.s64 %rd650, %rd141, %rd563;
1578
+ shl.b64 %rd651, %rd631, 1;
1579
+ add.s64 %rd610, %rd650, %rd651;
1580
+ add.s64 %rd653, %rd141, %rd566;
1581
+ shl.b64 %rd654, %rd632, 1;
1582
+ add.s64 %rd613, %rd653, %rd654;
1583
+ add.s64 %rd656, %rd141, %rd569;
1584
+ shl.b64 %rd657, %rd633, 1;
1585
+ add.s64 %rd616, %rd656, %rd657;
1586
+ .loc 1 63 48 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:48
1587
+ // begin inline asm
1588
+ mov.u64 %rd596, 0x0;
1589
+ createpolicy.fractional.L2::evict_last.b64 %rd596, 1.0;
1590
+ // end inline asm
1591
+ // begin inline asm
1592
+ mov.u16 %rs191, 0x0;
1593
+ @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs191 }, [ %rd595 + 0 ], %rd596;
1594
+ // end inline asm
1595
+ // begin inline asm
1596
+ mov.u64 %rd599, 0x0;
1597
+ createpolicy.fractional.L2::evict_last.b64 %rd599, 1.0;
1598
+ // end inline asm
1599
+ // begin inline asm
1600
+ mov.u16 %rs192, 0x0;
1601
+ @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs192 }, [ %rd598 + 0 ], %rd599;
1602
+ // end inline asm
1603
+ // begin inline asm
1604
+ mov.u64 %rd602, 0x0;
1605
+ createpolicy.fractional.L2::evict_last.b64 %rd602, 1.0;
1606
+ // end inline asm
1607
+ // begin inline asm
1608
+ mov.u16 %rs193, 0x0;
1609
+ @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs193 }, [ %rd601 + 0 ], %rd602;
1610
+ // end inline asm
1611
+ // begin inline asm
1612
+ mov.u64 %rd605, 0x0;
1613
+ createpolicy.fractional.L2::evict_last.b64 %rd605, 1.0;
1614
+ // end inline asm
1615
+ // begin inline asm
1616
+ mov.u16 %rs194, 0x0;
1617
+ @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs194 }, [ %rd604 + 0 ], %rd605;
1618
+ // end inline asm
1619
+ // begin inline asm
1620
+ mov.u64 %rd608, 0x0;
1621
+ createpolicy.fractional.L2::evict_last.b64 %rd608, 1.0;
1622
+ // end inline asm
1623
+ // begin inline asm
1624
+ mov.u16 %rs195, 0x0;
1625
+ @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs195 }, [ %rd607 + 0 ], %rd608;
1626
+ // end inline asm
1627
+ // begin inline asm
1628
+ mov.u64 %rd611, 0x0;
1629
+ createpolicy.fractional.L2::evict_last.b64 %rd611, 1.0;
1630
+ // end inline asm
1631
+ // begin inline asm
1632
+ mov.u16 %rs196, 0x0;
1633
+ @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs196 }, [ %rd610 + 0 ], %rd611;
1634
+ // end inline asm
1635
+ // begin inline asm
1636
+ mov.u64 %rd614, 0x0;
1637
+ createpolicy.fractional.L2::evict_last.b64 %rd614, 1.0;
1638
+ // end inline asm
1639
+ // begin inline asm
1640
+ mov.u16 %rs197, 0x0;
1641
+ @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs197 }, [ %rd613 + 0 ], %rd614;
1642
+ // end inline asm
1643
+ // begin inline asm
1644
+ mov.u64 %rd617, 0x0;
1645
+ createpolicy.fractional.L2::evict_last.b64 %rd617, 1.0;
1646
+ // end inline asm
1647
+ // begin inline asm
1648
+ mov.u16 %rs198, 0x0;
1649
+ @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs198 }, [ %rd616 + 0 ], %rd617;
1650
+ // end inline asm
1651
+ .loc 1 63 88 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:88
1652
+ cvt.f32.bf16 %r158, %rs191;
1653
+ cvt.f32.bf16 %r159, %rs192;
1654
+ cvt.f32.bf16 %r160, %rs193;
1655
+ cvt.f32.bf16 %r161, %rs194;
1656
+ cvt.f32.bf16 %r162, %rs195;
1657
+ cvt.f32.bf16 %r163, %rs196;
1658
+ cvt.f32.bf16 %r164, %rs197;
1659
+ cvt.f32.bf16 %r165, %rs198;
1660
+ .loc 1 65 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:65:20
1661
+ fma.rn.f32 %r166, %r157, %r158, %r149;
1662
+ fma.rn.f32 %r167, %r156, %r159, %r139;
1663
+ fma.rn.f32 %r168, %r155, %r160, %r129;
1664
+ fma.rn.f32 %r169, %r154, %r161, %r119;
1665
+ fma.rn.f32 %r170, %r153, %r162, %r109;
1666
+ fma.rn.f32 %r171, %r152, %r163, %r99;
1667
+ fma.rn.f32 %r172, %r151, %r164, %r89;
1668
+ fma.rn.f32 %r173, %r150, %r165, %r79;
1669
+ .loc 1 66 25 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:66:25
1670
+ add.s64 %rd618, %rd142, %rd291;
1671
+ add.s64 %rd619, %rd142, %rd292;
1672
+ add.s64 %rd620, %rd142, %rd293;
1673
+ add.s64 %rd621, %rd142, %rd294;
1674
+ add.s64 %rd622, %rd142, %rd295;
1675
+ add.s64 %rd623, %rd142, %rd296;
1676
+ add.s64 %rd624, %rd142, %rd297;
1677
+ add.s64 %rd625, %rd142, %rd298;
1678
+ .loc 1 66 37 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:66:37
1679
+ cvt.rn.bf16.f32 %rs199, %r166;
1680
+ cvt.rn.bf16.f32 %rs200, %r167;
1681
+ cvt.rn.bf16.f32 %rs201, %r168;
1682
+ cvt.rn.bf16.f32 %rs202, %r169;
1683
+ cvt.rn.bf16.f32 %rs203, %r170;
1684
+ cvt.rn.bf16.f32 %rs204, %r171;
1685
+ cvt.rn.bf16.f32 %rs205, %r172;
1686
+ cvt.rn.bf16.f32 %rs206, %r173;
1687
+ // begin inline asm
1688
+ @%p1 st.global.b16 [ %rd618 + 0 ], { %rs199 };
1689
+ // end inline asm
1690
+ // begin inline asm
1691
+ @%p2 st.global.b16 [ %rd619 + 0 ], { %rs200 };
1692
+ // end inline asm
1693
+ // begin inline asm
1694
+ @%p3 st.global.b16 [ %rd620 + 0 ], { %rs201 };
1695
+ // end inline asm
1696
+ // begin inline asm
1697
+ @%p4 st.global.b16 [ %rd621 + 0 ], { %rs202 };
1698
+ // end inline asm
1699
+ // begin inline asm
1700
+ @%p5 st.global.b16 [ %rd622 + 0 ], { %rs203 };
1701
+ // end inline asm
1702
+ // begin inline asm
1703
+ @%p6 st.global.b16 [ %rd623 + 0 ], { %rs204 };
1704
+ // end inline asm
1705
+ // begin inline asm
1706
+ @%p7 st.global.b16 [ %rd624 + 0 ], { %rs205 };
1707
+ // end inline asm
1708
+ // begin inline asm
1709
+ @%p8 st.global.b16 [ %rd625 + 0 ], { %rs206 };
1710
+ // end inline asm
1711
+ .loc 1 66 4 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:66:4
1712
+ ret;
1713
+ $L__BB0_49:
1714
+ .loc 1 36 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:123
1715
+ { // callseq 0, 0
1716
+ .param .b64 param0;
1717
+ .param .b64 param1;
1718
+ .param .b32 param2;
1719
+ .param .b64 param3;
1720
+ .param .b64 param4;
1721
+ mov.b64 %rd341, assertFunc_0;
1722
+ cvta.global.u64 %rd342, %rd341;
1723
+ st.param.b64 [param3], %rd342;
1724
+ mov.b64 %rd343, assertFile_0;
1725
+ cvta.global.u64 %rd344, %rd343;
1726
+ st.param.b64 [param1], %rd344;
1727
+ mov.b64 %rd345, assertMessage_0;
1728
+ cvta.global.u64 %rd346, %rd345;
1729
+ st.param.b64 [param0], %rd346;
1730
+ st.param.b64 [param4], 1;
1731
+ st.param.b32 [param2], 36;
1732
+ call.uni __assertfail, (param0, param1, param2, param3, param4);
1733
+ } // callseq 0
1734
+ trap;
1735
+ $L__BB0_51:
1736
+ .loc 1 51 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:126
1737
+ { // callseq 1, 0
1738
+ .param .b64 param0;
1739
+ .param .b64 param1;
1740
+ .param .b32 param2;
1741
+ .param .b64 param3;
1742
+ .param .b64 param4;
1743
+ mov.b64 %rd507, assertFunc_1;
1744
+ cvta.global.u64 %rd508, %rd507;
1745
+ st.param.b64 [param3], %rd508;
1746
+ mov.b64 %rd509, assertFile_1;
1747
+ cvta.global.u64 %rd510, %rd509;
1748
+ st.param.b64 [param1], %rd510;
1749
+ mov.b64 %rd511, assertMessage_1;
1750
+ cvta.global.u64 %rd512, %rd511;
1751
+ st.param.b64 [param0], %rd512;
1752
+ st.param.b64 [param4], 1;
1753
+ st.param.b32 [param2], 51;
1754
+ call.uni __assertfail, (param0, param1, param2, param3, param4);
1755
+ } // callseq 1
1756
+ trap;
1757
+ $L__BB0_53:
1758
+ .loc 1 62 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:64
1759
+ { // callseq 2, 0
1760
+ .param .b64 param0;
1761
+ .param .b64 param1;
1762
+ .param .b32 param2;
1763
+ .param .b64 param3;
1764
+ .param .b64 param4;
1765
+ mov.b64 %rd588, assertFunc_2;
1766
+ cvta.global.u64 %rd589, %rd588;
1767
+ st.param.b64 [param3], %rd589;
1768
+ mov.b64 %rd590, assertFile_2;
1769
+ cvta.global.u64 %rd591, %rd590;
1770
+ st.param.b64 [param1], %rd591;
1771
+ mov.b64 %rd592, assertMessage_2;
1772
+ cvta.global.u64 %rd593, %rd592;
1773
+ st.param.b64 [param0], %rd593;
1774
+ st.param.b64 [param4], 1;
1775
+ st.param.b32 [param2], 62;
1776
+ call.uni __assertfail, (param0, param1, param2, param3, param4);
1777
+ } // callseq 2
1778
+ trap;
1779
+ $L__tmp1:
1780
+ $L__func_end0:
1781
+ // -- End function
1782
+ }
1783
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py"
1784
+ .section .debug_abbrev
1785
+ {
1786
+ .b8 1 // Abbreviation Code
1787
+ .b8 17 // DW_TAG_compile_unit
1788
+ .b8 0 // DW_CHILDREN_no
1789
+ .b8 37 // DW_AT_producer
1790
+ .b8 8 // DW_FORM_string
1791
+ .b8 19 // DW_AT_language
1792
+ .b8 5 // DW_FORM_data2
1793
+ .b8 3 // DW_AT_name
1794
+ .b8 8 // DW_FORM_string
1795
+ .b8 16 // DW_AT_stmt_list
1796
+ .b8 6 // DW_FORM_data4
1797
+ .b8 27 // DW_AT_comp_dir
1798
+ .b8 8 // DW_FORM_string
1799
+ .b8 0 // EOM(1)
1800
+ .b8 0 // EOM(2)
1801
+ .b8 0 // EOM(3)
1802
+ }
1803
+ .section .debug_info
1804
+ {
1805
+ .b32 135 // Length of Unit
1806
+ .b8 2 // DWARF version number
1807
+ .b8 0
1808
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
1809
+ .b8 8 // Address Size (in bytes)
1810
+ .b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit
1811
+ .b8 116 // DW_AT_producer
1812
+ .b8 114
1813
+ .b8 105
1814
+ .b8 116
1815
+ .b8 111
1816
+ .b8 110
1817
+ .b8 0
1818
+ .b8 2 // DW_AT_language
1819
+ .b8 0
1820
+ .b8 99 // DW_AT_name
1821
+ .b8 109
1822
+ .b8 98
1823
+ .b8 53
1824
+ .b8 122
1825
+ .b8 108
1826
+ .b8 108
1827
+ .b8 100
1828
+ .b8 107
1829
+ .b8 114
1830
+ .b8 102
1831
+ .b8 108
1832
+ .b8 101
1833
+ .b8 121
1834
+ .b8 104
1835
+ .b8 99
1836
+ .b8 107
1837
+ .b8 104
1838
+ .b8 105
1839
+ .b8 111
1840
+ .b8 109
1841
+ .b8 104
1842
+ .b8 112
1843
+ .b8 102
1844
+ .b8 115
1845
+ .b8 118
1846
+ .b8 107
1847
+ .b8 102
1848
+ .b8 107
1849
+ .b8 108
1850
+ .b8 108
1851
+ .b8 111
1852
+ .b8 116
1853
+ .b8 98
1854
+ .b8 111
1855
+ .b8 112
1856
+ .b8 97
1857
+ .b8 108
1858
+ .b8 100
1859
+ .b8 100
1860
+ .b8 119
1861
+ .b8 107
1862
+ .b8 100
1863
+ .b8 102
1864
+ .b8 104
1865
+ .b8 108
1866
+ .b8 105
1867
+ .b8 122
1868
+ .b8 118
1869
+ .b8 101
1870
+ .b8 117
1871
+ .b8 120
1872
+ .b8 46
1873
+ .b8 112
1874
+ .b8 121
1875
+ .b8 0
1876
+ .b32 .debug_line // DW_AT_stmt_list
1877
+ .b8 47 // DW_AT_comp_dir
1878
+ .b8 119
1879
+ .b8 111
1880
+ .b8 114
1881
+ .b8 107
1882
+ .b8 115
1883
+ .b8 112
1884
+ .b8 97
1885
+ .b8 99
1886
+ .b8 101
1887
+ .b8 47
1888
+ .b8 104
1889
+ .b8 97
1890
+ .b8 110
1891
+ .b8 114
1892
+ .b8 117
1893
+ .b8 105
1894
+ .b8 47
1895
+ .b8 83
1896
+ .b8 112
1897
+ .b8 101
1898
+ .b8 99
1899
+ .b8 70
1900
+ .b8 111
1901
+ .b8 114
1902
+ .b8 103
1903
+ .b8 101
1904
+ .b8 45
1905
+ .b8 101
1906
+ .b8 120
1907
+ .b8 116
1908
+ .b8 47
1909
+ .b8 99
1910
+ .b8 97
1911
+ .b8 99
1912
+ .b8 104
1913
+ .b8 101
1914
+ .b8 47
1915
+ .b8 99
1916
+ .b8 111
1917
+ .b8 109
1918
+ .b8 112
1919
+ .b8 105
1920
+ .b8 108
1921
+ .b8 101
1922
+ .b8 100
1923
+ .b8 95
1924
+ .b8 107
1925
+ .b8 101
1926
+ .b8 114
1927
+ .b8 110
1928
+ .b8 101
1929
+ .b8 108
1930
+ .b8 115
1931
+ .b8 47
1932
+ .b8 109
1933
+ .b8 98
1934
+ .b8 0
1935
+ }
1936
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":18:0)
2
+ #loc109 = loc("in_ptr0"(#loc))
3
+ #loc110 = loc("in_ptr1"(#loc))
4
+ #loc111 = loc("in_ptr2"(#loc))
5
+ #loc112 = loc("in_ptr3"(#loc))
6
+ #loc113 = loc("out_ptr0"(#loc))
7
+ #loc114 = loc("ks0"(#loc))
8
+ #loc115 = loc("ks1"(#loc))
9
+ #loc116 = loc("ks2"(#loc))
10
+ #loc117 = loc("ks3"(#loc))
11
+ #loc118 = loc("xnumel"(#loc))
12
+ module {
13
+ tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
14
+ %xoffset = tt.get_program_id x : i32 loc(#loc119)
15
+ %xoffset_0 = arith.constant 1024 : i32 loc(#loc120)
16
+ %xoffset_1 = arith.constant 1024 : i32 loc(#loc120)
17
+ %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc120)
18
+ %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc121)
19
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc122)
20
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc122)
21
+ %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc123)
22
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<1024xi32> loc(#loc123)
23
+ %x0 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc124)
24
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc124)
25
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<1024xi64> loc(#loc124)
26
+ %x1 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc125)
27
+ %x1_8 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc125)
28
+ %x1_9 = arith.divsi %x1, %x1_8 : tensor<1024xi64> loc(#loc125)
29
+ %x1_10 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc126)
30
+ %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<1024xi64> loc(#loc126)
31
+ %tmp31 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc127)
32
+ %tmp31_12 = tt.addptr %tmp31, %xindex_4 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc127)
33
+ %tmp31_13 = tt.load %tmp31_12, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc128)
34
+ %tmp31_14 = arith.extf %tmp31_13 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc129)
35
+ %tmp32 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1024x!tt.ptr<i64>> loc(#loc130)
36
+ %tmp32_15 = tt.addptr %tmp32, %x1_11 : tensor<1024x!tt.ptr<i64>>, tensor<1024xi64> loc(#loc130)
37
+ %tmp32_16 = tt.load %tmp32_15, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>> loc(#loc131)
38
+ %tmp1 = arith.constant 2 : i32 loc(#loc132)
39
+ %tmp1_17 = arith.constant 2 : i64 loc(#loc132)
40
+ %tmp1_18 = arith.divsi %ks0, %tmp1_17 : i64 loc(#loc132)
41
+ %tmp2 = tt.splat %tmp1_18 : i64 -> tensor<1024xi64> loc(#loc133)
42
+ %tmp2_19 = arith.cmpi sge, %x0_7, %tmp2 : tensor<1024xi64> loc(#loc133)
43
+ %tmp3 = arith.constant 2 : i32 loc(#loc134)
44
+ %tmp3_20 = arith.constant 2 : i64 loc(#loc134)
45
+ %tmp3_21 = arith.divsi %ks0, %tmp3_20 : i64 loc(#loc134)
46
+ %tmp3_22 = arith.constant -1 : i32 loc(#loc135)
47
+ %tmp3_23 = arith.constant -1 : i64 loc(#loc135)
48
+ %tmp3_24 = arith.muli %tmp3_23, %tmp3_21 : i64 loc(#loc135)
49
+ %tmp3_25 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc136)
50
+ %tmp3_26 = tt.splat %tmp3_24 : i64 -> tensor<1024xi64> loc(#loc136)
51
+ %tmp3_27 = arith.addi %tmp3_25, %tmp3_26 : tensor<1024xi64> loc(#loc136)
52
+ %tmp3_28 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc137)
53
+ %tmp3_29 = tt.addptr %tmp3_28, %tmp3_27 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc137)
54
+ %tmp3_30 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc138)
55
+ %tmp3_31 = arith.constant 0.000000e+00 : f32 loc(#loc139)
56
+ %tmp3_32 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc139)
57
+ %tmp3_33 = arith.truncf %tmp3_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc139)
58
+ %tmp3_34 = tt.load %tmp3_29, %tmp3_30, %tmp3_33 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc139)
59
+ %tmp3_35 = arith.extf %tmp3_34 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc140)
60
+ %tmp4 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1024x!tt.ptr<i64>> loc(#loc141)
61
+ %tmp4_36 = tt.addptr %tmp4, %x1_11 : tensor<1024x!tt.ptr<i64>>, tensor<1024xi64> loc(#loc141)
62
+ %tmp4_37 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc142)
63
+ %tmp4_38 = arith.constant 0.000000e+00 : f32 loc(#loc143)
64
+ %tmp4_39 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc143)
65
+ %tmp4_40 = arith.fptosi %tmp4_39 : tensor<1024xf32> to tensor<1024xi64> loc(#loc143)
66
+ %tmp4_41 = tt.load %tmp4_36, %tmp4_37, %tmp4_40 evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>> loc(#loc143)
67
+ %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc144)
68
+ %tmp6 = arith.addi %tmp4_41, %tmp5 : tensor<1024xi64> loc(#loc145)
69
+ %tmp7 = arith.constant 0 : i32 loc(#loc146)
70
+ %tmp7_42 = arith.extsi %tmp7 : i32 to i64 loc(#loc146)
71
+ %tmp7_43 = tt.splat %tmp7_42 : i64 -> tensor<1024xi64> loc(#loc146)
72
+ %tmp7_44 = arith.cmpi slt, %tmp4_41, %tmp7_43 : tensor<1024xi64> loc(#loc146)
73
+ %tmp8 = arith.select %tmp7_44, %tmp6, %tmp4_41 : tensor<1024xi1>, tensor<1024xi64> loc(#loc147)
74
+ %c0_i32 = arith.constant 0 : i32 loc(#loc30)
75
+ %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc30)
76
+ %1 = tt.splat %0 : i64 -> tensor<1024xi64> loc(#loc30)
77
+ %2 = arith.cmpi sle, %1, %tmp8 : tensor<1024xi64> loc(#loc30)
78
+ %3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc31)
79
+ %4 = arith.cmpi slt, %tmp8, %3 : tensor<1024xi64> loc(#loc31)
80
+ %5 = arith.andi %2, %4 : tensor<1024xi1> loc(#loc32)
81
+ %6 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc33)
82
+ %true = arith.constant true loc(#loc34)
83
+ %cst = arith.constant dense<true> : tensor<1024xi1> loc(#loc34)
84
+ %7 = arith.xori %6, %cst : tensor<1024xi1> loc(#loc34)
85
+ %8 = arith.ori %5, %7 : tensor<1024xi1> loc(#loc35)
86
+ tt.assert %8, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc36)
87
+ %tmp10 = arith.constant 2 : i32 loc(#loc148)
88
+ %tmp10_45 = arith.constant 2 : i64 loc(#loc148)
89
+ %tmp10_46 = arith.divsi %ks0, %tmp10_45 : i64 loc(#loc148)
90
+ %tmp10_47 = arith.constant -1 : i32 loc(#loc149)
91
+ %tmp10_48 = arith.constant -1 : i64 loc(#loc149)
92
+ %tmp10_49 = arith.muli %tmp10_48, %tmp10_46 : i64 loc(#loc149)
93
+ %tmp10_50 = tt.splat %tmp10_49 : i64 -> tensor<1024xi64> loc(#loc150)
94
+ %tmp10_51 = arith.addi %x0_7, %tmp10_50 : tensor<1024xi64> loc(#loc150)
95
+ %tmp10_52 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc151)
96
+ %tmp10_53 = arith.muli %tmp10_52, %tmp8 : tensor<1024xi64> loc(#loc151)
97
+ %tmp10_54 = arith.addi %tmp10_51, %tmp10_53 : tensor<1024xi64> loc(#loc152)
98
+ %tmp10_55 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc153)
99
+ %tmp10_56 = tt.addptr %tmp10_55, %tmp10_54 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc153)
100
+ %tmp10_57 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc154)
101
+ %tmp10_58 = arith.constant 0.000000e+00 : f32 loc(#loc155)
102
+ %tmp10_59 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc155)
103
+ %tmp10_60 = arith.truncf %tmp10_59 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc155)
104
+ %tmp10_61 = tt.load %tmp10_56, %tmp10_57, %tmp10_60 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc155)
105
+ %tmp10_62 = arith.extf %tmp10_61 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc156)
106
+ %tmp11 = arith.mulf %tmp3_35, %tmp10_62 : tensor<1024xf32> loc(#loc157)
107
+ %tmp12 = arith.constant 0.000000e+00 : f32 loc(#loc158)
108
+ %tmp12_63 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc158)
109
+ %tmp12_64 = arith.subf %tmp12_63, %tmp11 : tensor<1024xf32> loc(#loc158)
110
+ %tmp13 = arith.constant 0.000000e+00 : f32 loc(#loc159)
111
+ %tmp13_65 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc159)
112
+ %tmp14 = arith.select %tmp2_19, %tmp12_64, %tmp13_65 : tensor<1024xi1>, tensor<1024xf32> loc(#loc160)
113
+ %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc161)
114
+ %tmp16 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc162)
115
+ %tmp16_66 = arith.select %tmp2_19, %tmp14, %tmp16 : tensor<1024xi1>, tensor<1024xf32> loc(#loc162)
116
+ %tmp17 = tt.splat %tmp1_18 : i64 -> tensor<1024xi64> loc(#loc163)
117
+ %tmp17_67 = arith.cmpi slt, %x0_7, %tmp17 : tensor<1024xi64> loc(#loc163)
118
+ %tmp18 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc164)
119
+ %tmp18_68 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc164)
120
+ %tmp18_69 = arith.addi %tmp18_68, %tmp18 : tensor<1024xi64> loc(#loc164)
121
+ %tmp18_70 = arith.constant 2 : i32 loc(#loc165)
122
+ %tmp18_71 = arith.constant 2 : i64 loc(#loc165)
123
+ %tmp18_72 = arith.divsi %ks0, %tmp18_71 : i64 loc(#loc165)
124
+ %tmp18_73 = arith.constant -1 : i32 loc(#loc166)
125
+ %tmp18_74 = arith.constant -1 : i64 loc(#loc166)
126
+ %tmp18_75 = arith.muli %tmp18_74, %tmp18_72 : i64 loc(#loc166)
127
+ %tmp18_76 = tt.splat %tmp18_75 : i64 -> tensor<1024xi64> loc(#loc167)
128
+ %tmp18_77 = arith.addi %tmp18_69, %tmp18_76 : tensor<1024xi64> loc(#loc167)
129
+ %tmp18_78 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc168)
130
+ %tmp18_79 = tt.addptr %tmp18_78, %tmp18_77 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc168)
131
+ %tmp18_80 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc169)
132
+ %tmp18_81 = arith.constant 0.000000e+00 : f32 loc(#loc170)
133
+ %tmp18_82 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc170)
134
+ %tmp18_83 = arith.truncf %tmp18_82 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc170)
135
+ %tmp18_84 = tt.load %tmp18_79, %tmp18_80, %tmp18_83 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc170)
136
+ %tmp18_85 = arith.extf %tmp18_84 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc171)
137
+ %tmp19 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1024x!tt.ptr<i64>> loc(#loc172)
138
+ %tmp19_86 = tt.addptr %tmp19, %x1_11 : tensor<1024x!tt.ptr<i64>>, tensor<1024xi64> loc(#loc172)
139
+ %tmp19_87 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc173)
140
+ %tmp19_88 = arith.constant 0.000000e+00 : f32 loc(#loc174)
141
+ %tmp19_89 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc174)
142
+ %tmp19_90 = arith.fptosi %tmp19_89 : tensor<1024xf32> to tensor<1024xi64> loc(#loc174)
143
+ %tmp19_91 = tt.load %tmp19_86, %tmp19_87, %tmp19_90 evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>> loc(#loc174)
144
+ %tmp20 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc175)
145
+ %tmp21 = arith.addi %tmp19_91, %tmp20 : tensor<1024xi64> loc(#loc176)
146
+ %tmp22 = arith.constant 0 : i32 loc(#loc177)
147
+ %tmp22_92 = arith.extsi %tmp22 : i32 to i64 loc(#loc177)
148
+ %tmp22_93 = tt.splat %tmp22_92 : i64 -> tensor<1024xi64> loc(#loc177)
149
+ %tmp22_94 = arith.cmpi slt, %tmp19_91, %tmp22_93 : tensor<1024xi64> loc(#loc177)
150
+ %tmp23 = arith.select %tmp22_94, %tmp21, %tmp19_91 : tensor<1024xi1>, tensor<1024xi64> loc(#loc178)
151
+ %c0_i32_95 = arith.constant 0 : i32 loc(#loc68)
152
+ %9 = arith.extsi %c0_i32_95 : i32 to i64 loc(#loc68)
153
+ %10 = tt.splat %9 : i64 -> tensor<1024xi64> loc(#loc68)
154
+ %11 = arith.cmpi sle, %10, %tmp23 : tensor<1024xi64> loc(#loc68)
155
+ %12 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc69)
156
+ %13 = arith.cmpi slt, %tmp23, %12 : tensor<1024xi64> loc(#loc69)
157
+ %14 = arith.andi %11, %13 : tensor<1024xi1> loc(#loc70)
158
+ %15 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc71)
159
+ %true_96 = arith.constant true loc(#loc72)
160
+ %cst_97 = arith.constant dense<true> : tensor<1024xi1> loc(#loc72)
161
+ %16 = arith.xori %15, %cst_97 : tensor<1024xi1> loc(#loc72)
162
+ %17 = arith.ori %14, %16 : tensor<1024xi1> loc(#loc73)
163
+ tt.assert %17, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc74)
164
+ %tmp25 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc179)
165
+ %tmp25_98 = arith.addi %tmp25, %x0_7 : tensor<1024xi64> loc(#loc179)
166
+ %tmp25_99 = arith.constant 2 : i32 loc(#loc180)
167
+ %tmp25_100 = arith.constant 2 : i64 loc(#loc180)
168
+ %tmp25_101 = arith.divsi %ks0, %tmp25_100 : i64 loc(#loc180)
169
+ %tmp25_102 = arith.constant -1 : i32 loc(#loc181)
170
+ %tmp25_103 = arith.constant -1 : i64 loc(#loc181)
171
+ %tmp25_104 = arith.muli %tmp25_103, %tmp25_101 : i64 loc(#loc181)
172
+ %tmp25_105 = tt.splat %tmp25_104 : i64 -> tensor<1024xi64> loc(#loc182)
173
+ %tmp25_106 = arith.addi %tmp25_98, %tmp25_105 : tensor<1024xi64> loc(#loc182)
174
+ %tmp25_107 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc183)
175
+ %tmp25_108 = arith.muli %tmp25_107, %tmp23 : tensor<1024xi64> loc(#loc183)
176
+ %tmp25_109 = arith.addi %tmp25_106, %tmp25_108 : tensor<1024xi64> loc(#loc184)
177
+ %tmp25_110 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc185)
178
+ %tmp25_111 = tt.addptr %tmp25_110, %tmp25_109 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc185)
179
+ %tmp25_112 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc186)
180
+ %tmp25_113 = arith.constant 0.000000e+00 : f32 loc(#loc187)
181
+ %tmp25_114 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc187)
182
+ %tmp25_115 = arith.truncf %tmp25_114 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc187)
183
+ %tmp25_116 = tt.load %tmp25_111, %tmp25_112, %tmp25_115 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc187)
184
+ %tmp25_117 = arith.extf %tmp25_116 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc188)
185
+ %tmp26 = arith.mulf %tmp18_85, %tmp25_117 : tensor<1024xf32> loc(#loc189)
186
+ %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc190)
187
+ %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc190)
188
+ %tmp28 = arith.select %tmp17_67, %tmp26, %tmp27_118 : tensor<1024xi1>, tensor<1024xf32> loc(#loc191)
189
+ %tmp29 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc192)
190
+ %tmp29_119 = arith.select %tmp17_67, %tmp28, %tmp29 : tensor<1024xi1>, tensor<1024xf32> loc(#loc192)
191
+ %tmp30 = arith.addf %tmp16_66, %tmp29_119 : tensor<1024xf32> loc(#loc193)
192
+ %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc194)
193
+ %tmp34_120 = arith.addi %tmp32_16, %tmp34 : tensor<1024xi64> loc(#loc194)
194
+ %tmp35 = arith.constant 0 : i32 loc(#loc195)
195
+ %tmp35_121 = arith.extsi %tmp35 : i32 to i64 loc(#loc195)
196
+ %tmp35_122 = tt.splat %tmp35_121 : i64 -> tensor<1024xi64> loc(#loc195)
197
+ %tmp35_123 = arith.cmpi slt, %tmp32_16, %tmp35_122 : tensor<1024xi64> loc(#loc195)
198
+ %tmp36 = arith.select %tmp35_123, %tmp34_120, %tmp32_16 : tensor<1024xi1>, tensor<1024xi64> loc(#loc196)
199
+ %c0_i32_124 = arith.constant 0 : i32 loc(#loc93)
200
+ %18 = arith.extsi %c0_i32_124 : i32 to i64 loc(#loc93)
201
+ %19 = tt.splat %18 : i64 -> tensor<1024xi64> loc(#loc93)
202
+ %20 = arith.cmpi sle, %19, %tmp36 : tensor<1024xi64> loc(#loc93)
203
+ %21 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc94)
204
+ %22 = arith.cmpi slt, %tmp36, %21 : tensor<1024xi64> loc(#loc94)
205
+ %23 = arith.andi %20, %22 : tensor<1024xi1> loc(#loc95)
206
+ %true_125 = arith.constant true loc(#loc96)
207
+ %cst_126 = arith.constant dense<true> : tensor<1024xi1> loc(#loc96)
208
+ %24 = arith.xori %xmask_5, %cst_126 : tensor<1024xi1> loc(#loc96)
209
+ %25 = arith.ori %23, %24 : tensor<1024xi1> loc(#loc97)
210
+ tt.assert %25, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1> loc(#loc98)
211
+ %tmp38 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc197)
212
+ %tmp38_127 = arith.muli %tmp38, %tmp36 : tensor<1024xi64> loc(#loc197)
213
+ %tmp38_128 = arith.addi %x0_7, %tmp38_127 : tensor<1024xi64> loc(#loc198)
214
+ %tmp38_129 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc199)
215
+ %tmp38_130 = tt.addptr %tmp38_129, %tmp38_128 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc199)
216
+ %tmp38_131 = tt.load %tmp38_130, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc200)
217
+ %tmp38_132 = arith.extf %tmp38_131 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc201)
218
+ %tmp39 = arith.mulf %tmp31_14, %tmp38_132 : tensor<1024xf32> loc(#loc202)
219
+ %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32> loc(#loc203)
220
+ %26 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc106)
221
+ %27 = tt.addptr %26, %xindex_4 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc106)
222
+ %28 = arith.truncf %tmp40 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc107)
223
+ tt.store %27, %28, %xmask_5 : tensor<1024x!tt.ptr<bf16>> loc(#loc107)
224
+ tt.return loc(#loc108)
225
+ } loc(#loc)
226
+ } loc(#loc)
227
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:28)
228
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:33)
229
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:36)
230
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:23)
231
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":21:21)
232
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":22:19)
233
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:21)
234
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:28)
235
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:31)
236
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:36)
237
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:76)
238
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:31)
239
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:36)
240
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":28:18)
241
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":29:19)
242
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:48)
243
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:41)
244
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:35)
245
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:30)
246
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:60)
247
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:53)
248
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:111)
249
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:30)
250
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:42)
251
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:35)
252
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":32:32)
253
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":33:18)
254
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":34:18)
255
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":35:32)
256
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:28)
257
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:98)
258
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:64)
259
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:115)
260
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:108)
261
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:106)
262
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:123)
263
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:49)
264
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:42)
265
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:36)
266
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:58)
267
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:54)
268
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:31)
269
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:72)
270
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:65)
271
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:123)
272
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":38:19)
273
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":39:13)
274
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":40:38)
275
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":41:34)
276
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":42:12)
277
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":43:34)
278
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":44:19)
279
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:37)
280
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:55)
281
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:48)
282
+ #loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:42)
283
+ #loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:31)
284
+ #loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:68)
285
+ #loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:60)
286
+ #loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:119)
287
+ #loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:31)
288
+ #loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:44)
289
+ #loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:36)
290
+ #loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":47:33)
291
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":48:20)
292
+ #loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":49:20)
293
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":50:35)
294
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:28)
295
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:100)
296
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:65)
297
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:118)
298
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:110)
299
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:108)
300
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:126)
301
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:37)
302
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:55)
303
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:48)
304
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:42)
305
+ #loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:64)
306
+ #loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:60)
307
+ #loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:31)
308
+ #loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:80)
309
+ #loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:72)
310
+ #loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:131)
311
+ #loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":53:20)
312
+ #loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":54:38)
313
+ #loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":55:35)
314
+ #loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":56:35)
315
+ #loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":57:20)
316
+ #loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":59:20)
317
+ #loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":60:20)
318
+ #loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":61:35)
319
+ #loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:28)
320
+ #loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:46)
321
+ #loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:38)
322
+ #loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:56)
323
+ #loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:54)
324
+ #loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:64)
325
+ #loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:40)
326
+ #loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:36)
327
+ #loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:31)
328
+ #loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:48)
329
+ #loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:88)
330
+ #loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":64:20)
331
+ #loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":65:20)
332
+ #loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:25)
333
+ #loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:37)
334
+ #loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:4)
335
+ #loc119 = loc("xoffset"(#loc1))
336
+ #loc120 = loc("xoffset"(#loc2))
337
+ #loc121 = loc("xindex"(#loc3))
338
+ #loc122 = loc("xindex"(#loc4))
339
+ #loc123 = loc("xmask"(#loc5))
340
+ #loc124 = loc("x0"(#loc6))
341
+ #loc125 = loc("x1"(#loc7))
342
+ #loc126 = loc("x1"(#loc8))
343
+ #loc127 = loc("tmp31"(#loc9))
344
+ #loc128 = loc("tmp31"(#loc10))
345
+ #loc129 = loc("tmp31"(#loc11))
346
+ #loc130 = loc("tmp32"(#loc12))
347
+ #loc131 = loc("tmp32"(#loc13))
348
+ #loc132 = loc("tmp1"(#loc14))
349
+ #loc133 = loc("tmp2"(#loc15))
350
+ #loc134 = loc("tmp3"(#loc16))
351
+ #loc135 = loc("tmp3"(#loc17))
352
+ #loc136 = loc("tmp3"(#loc18))
353
+ #loc137 = loc("tmp3"(#loc19))
354
+ #loc138 = loc("tmp3"(#loc20))
355
+ #loc139 = loc("tmp3"(#loc21))
356
+ #loc140 = loc("tmp3"(#loc22))
357
+ #loc141 = loc("tmp4"(#loc23))
358
+ #loc142 = loc("tmp4"(#loc24))
359
+ #loc143 = loc("tmp4"(#loc25))
360
+ #loc144 = loc("tmp5"(#loc26))
361
+ #loc145 = loc("tmp6"(#loc27))
362
+ #loc146 = loc("tmp7"(#loc28))
363
+ #loc147 = loc("tmp8"(#loc29))
364
+ #loc148 = loc("tmp10"(#loc37))
365
+ #loc149 = loc("tmp10"(#loc38))
366
+ #loc150 = loc("tmp10"(#loc39))
367
+ #loc151 = loc("tmp10"(#loc40))
368
+ #loc152 = loc("tmp10"(#loc41))
369
+ #loc153 = loc("tmp10"(#loc42))
370
+ #loc154 = loc("tmp10"(#loc43))
371
+ #loc155 = loc("tmp10"(#loc44))
372
+ #loc156 = loc("tmp10"(#loc45))
373
+ #loc157 = loc("tmp11"(#loc46))
374
+ #loc158 = loc("tmp12"(#loc47))
375
+ #loc159 = loc("tmp13"(#loc48))
376
+ #loc160 = loc("tmp14"(#loc49))
377
+ #loc161 = loc("tmp15"(#loc50))
378
+ #loc162 = loc("tmp16"(#loc51))
379
+ #loc163 = loc("tmp17"(#loc52))
380
+ #loc164 = loc("tmp18"(#loc53))
381
+ #loc165 = loc("tmp18"(#loc54))
382
+ #loc166 = loc("tmp18"(#loc55))
383
+ #loc167 = loc("tmp18"(#loc56))
384
+ #loc168 = loc("tmp18"(#loc57))
385
+ #loc169 = loc("tmp18"(#loc58))
386
+ #loc170 = loc("tmp18"(#loc59))
387
+ #loc171 = loc("tmp18"(#loc60))
388
+ #loc172 = loc("tmp19"(#loc61))
389
+ #loc173 = loc("tmp19"(#loc62))
390
+ #loc174 = loc("tmp19"(#loc63))
391
+ #loc175 = loc("tmp20"(#loc64))
392
+ #loc176 = loc("tmp21"(#loc65))
393
+ #loc177 = loc("tmp22"(#loc66))
394
+ #loc178 = loc("tmp23"(#loc67))
395
+ #loc179 = loc("tmp25"(#loc75))
396
+ #loc180 = loc("tmp25"(#loc76))
397
+ #loc181 = loc("tmp25"(#loc77))
398
+ #loc182 = loc("tmp25"(#loc78))
399
+ #loc183 = loc("tmp25"(#loc79))
400
+ #loc184 = loc("tmp25"(#loc80))
401
+ #loc185 = loc("tmp25"(#loc81))
402
+ #loc186 = loc("tmp25"(#loc82))
403
+ #loc187 = loc("tmp25"(#loc83))
404
+ #loc188 = loc("tmp25"(#loc84))
405
+ #loc189 = loc("tmp26"(#loc85))
406
+ #loc190 = loc("tmp27"(#loc86))
407
+ #loc191 = loc("tmp28"(#loc87))
408
+ #loc192 = loc("tmp29"(#loc88))
409
+ #loc193 = loc("tmp30"(#loc89))
410
+ #loc194 = loc("tmp34"(#loc90))
411
+ #loc195 = loc("tmp35"(#loc91))
412
+ #loc196 = loc("tmp36"(#loc92))
413
+ #loc197 = loc("tmp38"(#loc99))
414
+ #loc198 = loc("tmp38"(#loc100))
415
+ #loc199 = loc("tmp38"(#loc101))
416
+ #loc200 = loc("tmp38"(#loc102))
417
+ #loc201 = loc("tmp38"(#loc103))
418
+ #loc202 = loc("tmp39"(#loc104))
419
+ #loc203 = loc("tmp40"(#loc105))
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
2
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":18:0)
3
+ #loc91 = loc("in_ptr0"(#loc))
4
+ #loc92 = loc("in_ptr1"(#loc))
5
+ #loc93 = loc("in_ptr2"(#loc))
6
+ #loc94 = loc("in_ptr3"(#loc))
7
+ #loc95 = loc("out_ptr0"(#loc))
8
+ #loc96 = loc("ks0"(#loc))
9
+ #loc97 = loc("ks1"(#loc))
10
+ #loc98 = loc("ks2"(#loc))
11
+ #loc99 = loc("ks3"(#loc))
12
+ #loc100 = loc("xnumel"(#loc))
13
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
14
+ tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
15
+ %cst = arith.constant dense<true> : tensor<1024xi1, #blocked> loc(#loc1)
16
+ %c-1_i64 = arith.constant -1 : i64 loc(#loc1)
17
+ %c2_i64 = arith.constant 2 : i64 loc(#loc1)
18
+ %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
19
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1)
20
+ %cst_1 = arith.constant dense<0> : tensor<1024xi64, #blocked> loc(#loc1)
21
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
22
+ %xoffset = tt.get_program_id x : i32 loc(#loc101)
23
+ %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc102)
24
+ %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc103)
25
+ %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc104)
26
+ %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc104)
27
+ %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32, #blocked> loc(#loc105)
28
+ %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32, #blocked> loc(#loc105)
29
+ %x0 = arith.extsi %xindex_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc106)
30
+ %x0_7 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc106)
31
+ %x0_8 = arith.remsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc106)
32
+ %x1 = arith.divsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc107)
33
+ %x1_9 = tt.splat %ks1 : i64 -> tensor<1024xi64, #blocked> loc(#loc108)
34
+ %x1_10 = arith.remsi %x1, %x1_9 : tensor<1024xi64, #blocked> loc(#loc108)
35
+ %tmp31 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc109)
36
+ %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc109)
37
+ %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc110)
38
+ %tmp31_13 = arith.extf %tmp31_12 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc111)
39
+ %tmp32 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1024x!tt.ptr<i64>, #blocked> loc(#loc112)
40
+ %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<1024x!tt.ptr<i64>, #blocked>, tensor<1024xi64, #blocked> loc(#loc112)
41
+ %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>, #blocked> loc(#loc113)
42
+ %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114)
43
+ %tmp2 = tt.splat %tmp1 : i64 -> tensor<1024xi64, #blocked> loc(#loc115)
44
+ %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc115)
45
+ %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116)
46
+ %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<1024xi64, #blocked> loc(#loc117)
47
+ %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc117)
48
+ %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi64, #blocked> loc(#loc118)
49
+ %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc119)
50
+ %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc120)
51
+ %tmp3_22 = arith.extf %tmp3_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc121)
52
+ %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>, #blocked> loc(#loc122)
53
+ %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64, #blocked> loc(#loc123)
54
+ %tmp6 = arith.addi %tmp4, %tmp5 : tensor<1024xi64, #blocked> loc(#loc124)
55
+ %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<1024xi64, #blocked> loc(#loc125)
56
+ %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc126)
57
+ %0 = arith.cmpi sge, %tmp8, %cst_1 : tensor<1024xi64, #blocked> loc(#loc28)
58
+ %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<1024xi64, #blocked> loc(#loc29)
59
+ %2 = arith.andi %0, %1 : tensor<1024xi1, #blocked> loc(#loc30)
60
+ %3 = arith.xori %tmp3_20, %cst : tensor<1024xi1, #blocked> loc(#loc31)
61
+ %4 = arith.ori %2, %3 : tensor<1024xi1, #blocked> loc(#loc32)
62
+ tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc33)
63
+ %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc127)
64
+ %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<1024xi64, #blocked> loc(#loc128)
65
+ %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<1024xi64, #blocked> loc(#loc129)
66
+ %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc130)
67
+ %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi64, #blocked> loc(#loc130)
68
+ %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc131)
69
+ %tmp10_28 = arith.extf %tmp10_27 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc132)
70
+ %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<1024xf32, #blocked> loc(#loc133)
71
+ %tmp12 = arith.subf %cst_2, %tmp11 : tensor<1024xf32, #blocked> loc(#loc134)
72
+ %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc169)
73
+ %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc137)
74
+ %tmp18 = arith.addi %x0_7, %x0 : tensor<1024xi64, #blocked> loc(#loc138)
75
+ %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc139)
76
+ %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi64, #blocked> loc(#loc140)
77
+ %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc141)
78
+ %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc142)
79
+ %tmp18_33 = arith.extf %tmp18_32 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc143)
80
+ %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>, #blocked> loc(#loc144)
81
+ %tmp21 = arith.addi %tmp19, %tmp5 : tensor<1024xi64, #blocked> loc(#loc145)
82
+ %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<1024xi64, #blocked> loc(#loc146)
83
+ %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc147)
84
+ %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<1024xi64, #blocked> loc(#loc55)
85
+ %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<1024xi64, #blocked> loc(#loc56)
86
+ %7 = arith.andi %5, %6 : tensor<1024xi1, #blocked> loc(#loc57)
87
+ %8 = arith.xori %tmp18_31, %cst : tensor<1024xi1, #blocked> loc(#loc58)
88
+ %9 = arith.ori %7, %8 : tensor<1024xi1, #blocked> loc(#loc59)
89
+ tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc60)
90
+ %tmp25 = arith.addi %x0_7, %x0_8 : tensor<1024xi64, #blocked> loc(#loc148)
91
+ %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc149)
92
+ %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<1024xi64, #blocked> loc(#loc150)
93
+ %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<1024xi64, #blocked> loc(#loc151)
94
+ %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi64, #blocked> loc(#loc152)
95
+ %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc153)
96
+ %tmp25_39 = arith.extf %tmp25_38 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc154)
97
+ %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<1024xf32, #blocked> loc(#loc155)
98
+ %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc170)
99
+ %tmp30 = arith.addf %tmp16, %tmp29 : tensor<1024xf32, #blocked> loc(#loc158)
100
+ %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64, #blocked> loc(#loc159)
101
+ %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<1024xi64, #blocked> loc(#loc159)
102
+ %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<1024xi64, #blocked> loc(#loc160)
103
+ %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc161)
104
+ %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<1024xi64, #blocked> loc(#loc75)
105
+ %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<1024xi64, #blocked> loc(#loc76)
106
+ %12 = arith.andi %10, %11 : tensor<1024xi1, #blocked> loc(#loc77)
107
+ %13 = arith.xori %xmask_6, %cst : tensor<1024xi1, #blocked> loc(#loc78)
108
+ %14 = arith.ori %12, %13 : tensor<1024xi1, #blocked> loc(#loc79)
109
+ tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1, #blocked> loc(#loc80)
110
+ %tmp38 = arith.muli %x0_7, %tmp36 : tensor<1024xi64, #blocked> loc(#loc162)
111
+ %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<1024xi64, #blocked> loc(#loc163)
112
+ %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc164)
113
+ %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi64, #blocked> loc(#loc164)
114
+ %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc165)
115
+ %tmp38_45 = arith.extf %tmp38_44 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc166)
116
+ %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<1024xf32, #blocked> loc(#loc167)
117
+ %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32, #blocked> loc(#loc168)
118
+ %15 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc88)
119
+ %16 = tt.addptr %15, %xindex_5 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc88)
120
+ %17 = arith.truncf %tmp40 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc89)
121
+ tt.store %16, %17, %xmask_6 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc89)
122
+ tt.return loc(#loc90)
123
+ } loc(#loc)
124
+ } loc(#loc)
125
+ #loc1 = loc(unknown)
126
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:28)
127
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:33)
128
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:36)
129
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:23)
130
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":21:21)
131
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":22:19)
132
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:21)
133
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:28)
134
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:31)
135
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:36)
136
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:76)
137
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:31)
138
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:36)
139
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":28:18)
140
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":29:19)
141
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:41)
142
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:35)
143
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:30)
144
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:60)
145
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:53)
146
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:111)
147
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:35)
148
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":32:32)
149
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":33:18)
150
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":34:18)
151
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":35:32)
152
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:28)
153
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:98)
154
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:64)
155
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:108)
156
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:106)
157
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:123)
158
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:36)
159
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:58)
160
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:54)
161
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:31)
162
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:65)
163
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:123)
164
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":38:19)
165
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":39:13)
166
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":43:34)
167
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":41:34)
168
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":44:19)
169
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:37)
170
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:42)
171
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:31)
172
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:68)
173
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:60)
174
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:119)
175
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:36)
176
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":48:20)
177
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":49:20)
178
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":50:35)
179
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:28)
180
+ #loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:100)
181
+ #loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:65)
182
+ #loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:110)
183
+ #loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:108)
184
+ #loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:126)
185
+ #loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:37)
186
+ #loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:42)
187
+ #loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:64)
188
+ #loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:60)
189
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:31)
190
+ #loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:72)
191
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:131)
192
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":53:20)
193
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":56:35)
194
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":55:35)
195
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":57:20)
196
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":59:20)
197
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":60:20)
198
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":61:35)
199
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:28)
200
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:46)
201
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:38)
202
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:56)
203
+ #loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:54)
204
+ #loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:64)
205
+ #loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:40)
206
+ #loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:36)
207
+ #loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:31)
208
+ #loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:48)
209
+ #loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:88)
210
+ #loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":64:20)
211
+ #loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":65:20)
212
+ #loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:25)
213
+ #loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:37)
214
+ #loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:4)
215
+ #loc101 = loc("xoffset"(#loc2))
216
+ #loc102 = loc("xoffset"(#loc3))
217
+ #loc103 = loc("xindex"(#loc4))
218
+ #loc104 = loc("xindex"(#loc5))
219
+ #loc105 = loc("xmask"(#loc6))
220
+ #loc106 = loc("x0"(#loc7))
221
+ #loc107 = loc("x1"(#loc8))
222
+ #loc108 = loc("x1"(#loc9))
223
+ #loc109 = loc("tmp31"(#loc10))
224
+ #loc110 = loc("tmp31"(#loc11))
225
+ #loc111 = loc("tmp31"(#loc12))
226
+ #loc112 = loc("tmp32"(#loc13))
227
+ #loc113 = loc("tmp32"(#loc14))
228
+ #loc114 = loc("tmp1"(#loc15))
229
+ #loc115 = loc("tmp2"(#loc16))
230
+ #loc116 = loc("tmp3"(#loc17))
231
+ #loc117 = loc("tmp3"(#loc18))
232
+ #loc118 = loc("tmp3"(#loc19))
233
+ #loc119 = loc("tmp3"(#loc20))
234
+ #loc120 = loc("tmp3"(#loc21))
235
+ #loc121 = loc("tmp3"(#loc22))
236
+ #loc122 = loc("tmp4"(#loc23))
237
+ #loc123 = loc("tmp5"(#loc24))
238
+ #loc124 = loc("tmp6"(#loc25))
239
+ #loc125 = loc("tmp7"(#loc26))
240
+ #loc126 = loc("tmp8"(#loc27))
241
+ #loc127 = loc("tmp10"(#loc34))
242
+ #loc128 = loc("tmp10"(#loc35))
243
+ #loc129 = loc("tmp10"(#loc36))
244
+ #loc130 = loc("tmp10"(#loc37))
245
+ #loc131 = loc("tmp10"(#loc38))
246
+ #loc132 = loc("tmp10"(#loc39))
247
+ #loc133 = loc("tmp11"(#loc40))
248
+ #loc134 = loc("tmp12"(#loc41))
249
+ #loc135 = loc("tmp16"(#loc42))
250
+ #loc136 = loc("tmp14"(#loc43))
251
+ #loc137 = loc("tmp17"(#loc44))
252
+ #loc138 = loc("tmp18"(#loc45))
253
+ #loc139 = loc("tmp18"(#loc46))
254
+ #loc140 = loc("tmp18"(#loc47))
255
+ #loc141 = loc("tmp18"(#loc48))
256
+ #loc142 = loc("tmp18"(#loc49))
257
+ #loc143 = loc("tmp18"(#loc50))
258
+ #loc144 = loc("tmp19"(#loc51))
259
+ #loc145 = loc("tmp21"(#loc52))
260
+ #loc146 = loc("tmp22"(#loc53))
261
+ #loc147 = loc("tmp23"(#loc54))
262
+ #loc148 = loc("tmp25"(#loc61))
263
+ #loc149 = loc("tmp25"(#loc62))
264
+ #loc150 = loc("tmp25"(#loc63))
265
+ #loc151 = loc("tmp25"(#loc64))
266
+ #loc152 = loc("tmp25"(#loc65))
267
+ #loc153 = loc("tmp25"(#loc66))
268
+ #loc154 = loc("tmp25"(#loc67))
269
+ #loc155 = loc("tmp26"(#loc68))
270
+ #loc156 = loc("tmp29"(#loc69))
271
+ #loc157 = loc("tmp28"(#loc70))
272
+ #loc158 = loc("tmp30"(#loc71))
273
+ #loc159 = loc("tmp34"(#loc72))
274
+ #loc160 = loc("tmp35"(#loc73))
275
+ #loc161 = loc("tmp36"(#loc74))
276
+ #loc162 = loc("tmp38"(#loc81))
277
+ #loc163 = loc("tmp38"(#loc82))
278
+ #loc164 = loc("tmp38"(#loc83))
279
+ #loc165 = loc("tmp38"(#loc84))
280
+ #loc166 = loc("tmp38"(#loc85))
281
+ #loc167 = loc("tmp39"(#loc86))
282
+ #loc168 = loc("tmp40"(#loc87))
283
+ #loc169 = loc(fused[#loc135, #loc136])
284
+ #loc170 = loc(fused[#loc156, #loc157])
SpecForge-ext/cache/compiled_kernels/triton/6/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":18:0)
2
+ #loc91 = loc("in_ptr0"(#loc))
3
+ #loc92 = loc("in_ptr1"(#loc))
4
+ #loc93 = loc("in_ptr2"(#loc))
5
+ #loc94 = loc("in_ptr3"(#loc))
6
+ #loc95 = loc("out_ptr0"(#loc))
7
+ #loc96 = loc("ks0"(#loc))
8
+ #loc97 = loc("ks1"(#loc))
9
+ #loc98 = loc("ks2"(#loc))
10
+ #loc99 = loc("ks3"(#loc))
11
+ #loc100 = loc("xnumel"(#loc))
12
+ module {
13
+ tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
14
+ %cst = arith.constant dense<0> : tensor<1024xi64> loc(#loc1)
15
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1)
16
+ %cst_1 = arith.constant dense<true> : tensor<1024xi1> loc(#loc1)
17
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1)
18
+ %c-1_i64 = arith.constant -1 : i64 loc(#loc1)
19
+ %c2_i64 = arith.constant 2 : i64 loc(#loc1)
20
+ %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
21
+ %xoffset = tt.get_program_id x : i32 loc(#loc101)
22
+ %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc102)
23
+ %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc103)
24
+ %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc104)
25
+ %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc104)
26
+ %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc105)
27
+ %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32> loc(#loc105)
28
+ %x0 = arith.extsi %xindex_5 : tensor<1024xi32> to tensor<1024xi64> loc(#loc106)
29
+ %x0_7 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc106)
30
+ %x0_8 = arith.remsi %x0, %x0_7 : tensor<1024xi64> loc(#loc106)
31
+ %x1 = arith.divsi %x0, %x0_7 : tensor<1024xi64> loc(#loc107)
32
+ %x1_9 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc108)
33
+ %x1_10 = arith.remsi %x1, %x1_9 : tensor<1024xi64> loc(#loc108)
34
+ %tmp31 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc109)
35
+ %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc109)
36
+ %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc110)
37
+ %tmp31_13 = arith.extf %tmp31_12 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc111)
38
+ %tmp32 = tt.splat %in_ptr1 : !tt.ptr<i64> -> tensor<1024x!tt.ptr<i64>> loc(#loc112)
39
+ %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<1024x!tt.ptr<i64>>, tensor<1024xi64> loc(#loc112)
40
+ %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>> loc(#loc113)
41
+ %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114)
42
+ %tmp2 = tt.splat %tmp1 : i64 -> tensor<1024xi64> loc(#loc115)
43
+ %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<1024xi64> loc(#loc115)
44
+ %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116)
45
+ %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<1024xi64> loc(#loc117)
46
+ %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<1024xi64> loc(#loc117)
47
+ %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc118)
48
+ %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<1024xi1> loc(#loc119)
49
+ %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc120)
50
+ %tmp3_22 = arith.extf %tmp3_21 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc121)
51
+ %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>> loc(#loc122)
52
+ %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc123)
53
+ %tmp6 = arith.addi %tmp4, %tmp5 : tensor<1024xi64> loc(#loc124)
54
+ %tmp7 = arith.cmpi slt, %tmp4, %cst : tensor<1024xi64> loc(#loc125)
55
+ %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<1024xi1>, tensor<1024xi64> loc(#loc126)
56
+ %0 = arith.cmpi sge, %tmp8, %cst : tensor<1024xi64> loc(#loc28)
57
+ %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<1024xi64> loc(#loc29)
58
+ %2 = arith.andi %0, %1 : tensor<1024xi1> loc(#loc30)
59
+ %3 = arith.xori %tmp3_20, %cst_1 : tensor<1024xi1> loc(#loc31)
60
+ %4 = arith.ori %2, %3 : tensor<1024xi1> loc(#loc32)
61
+ tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc33)
62
+ %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<1024xi64> loc(#loc127)
63
+ %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<1024xi64> loc(#loc128)
64
+ %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<1024xi64> loc(#loc129)
65
+ %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc130)
66
+ %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc130)
67
+ %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc131)
68
+ %tmp10_28 = arith.extf %tmp10_27 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc132)
69
+ %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<1024xf32> loc(#loc133)
70
+ %tmp12 = arith.subf %cst_2, %tmp11 : tensor<1024xf32> loc(#loc134)
71
+ %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc169)
72
+ %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<1024xi64> loc(#loc137)
73
+ %tmp18 = arith.addi %x0_7, %x0 : tensor<1024xi64> loc(#loc138)
74
+ %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<1024xi64> loc(#loc139)
75
+ %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc140)
76
+ %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<1024xi1> loc(#loc141)
77
+ %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc142)
78
+ %tmp18_33 = arith.extf %tmp18_32 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc143)
79
+ %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<i64>> loc(#loc144)
80
+ %tmp21 = arith.addi %tmp19, %tmp5 : tensor<1024xi64> loc(#loc145)
81
+ %tmp22 = arith.cmpi slt, %tmp19, %cst : tensor<1024xi64> loc(#loc146)
82
+ %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<1024xi1>, tensor<1024xi64> loc(#loc147)
83
+ %5 = arith.cmpi sge, %tmp23, %cst : tensor<1024xi64> loc(#loc55)
84
+ %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<1024xi64> loc(#loc56)
85
+ %7 = arith.andi %5, %6 : tensor<1024xi1> loc(#loc57)
86
+ %8 = arith.xori %tmp18_31, %cst_1 : tensor<1024xi1> loc(#loc58)
87
+ %9 = arith.ori %7, %8 : tensor<1024xi1> loc(#loc59)
88
+ tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc60)
89
+ %tmp25 = arith.addi %x0_7, %x0_8 : tensor<1024xi64> loc(#loc148)
90
+ %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<1024xi64> loc(#loc149)
91
+ %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<1024xi64> loc(#loc150)
92
+ %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<1024xi64> loc(#loc151)
93
+ %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc152)
94
+ %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc153)
95
+ %tmp25_39 = arith.extf %tmp25_38 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc154)
96
+ %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<1024xf32> loc(#loc155)
97
+ %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc170)
98
+ %tmp30 = arith.addf %tmp16, %tmp29 : tensor<1024xf32> loc(#loc158)
99
+ %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc159)
100
+ %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<1024xi64> loc(#loc159)
101
+ %tmp35 = arith.cmpi slt, %tmp32_15, %cst : tensor<1024xi64> loc(#loc160)
102
+ %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<1024xi1>, tensor<1024xi64> loc(#loc161)
103
+ %10 = arith.cmpi sge, %tmp36, %cst : tensor<1024xi64> loc(#loc75)
104
+ %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<1024xi64> loc(#loc76)
105
+ %12 = arith.andi %10, %11 : tensor<1024xi1> loc(#loc77)
106
+ %13 = arith.xori %xmask_6, %cst_1 : tensor<1024xi1> loc(#loc78)
107
+ %14 = arith.ori %12, %13 : tensor<1024xi1> loc(#loc79)
108
+ tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1> loc(#loc80)
109
+ %tmp38 = arith.muli %x0_7, %tmp36 : tensor<1024xi64> loc(#loc162)
110
+ %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<1024xi64> loc(#loc163)
111
+ %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc164)
112
+ %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc164)
113
+ %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc165)
114
+ %tmp38_45 = arith.extf %tmp38_44 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc166)
115
+ %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<1024xf32> loc(#loc167)
116
+ %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32> loc(#loc168)
117
+ %15 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc88)
118
+ %16 = tt.addptr %15, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc88)
119
+ %17 = arith.truncf %tmp40 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc89)
120
+ tt.store %16, %17, %xmask_6 : tensor<1024x!tt.ptr<bf16>> loc(#loc89)
121
+ tt.return loc(#loc90)
122
+ } loc(#loc)
123
+ } loc(#loc)
124
+ #loc1 = loc(unknown)
125
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:28)
126
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:33)
127
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:36)
128
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:23)
129
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":21:21)
130
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":22:19)
131
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:21)
132
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:28)
133
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:31)
134
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:36)
135
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:76)
136
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:31)
137
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:36)
138
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":28:18)
139
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":29:19)
140
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:41)
141
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:35)
142
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:30)
143
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:60)
144
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:53)
145
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:111)
146
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:35)
147
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":32:32)
148
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":33:18)
149
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":34:18)
150
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":35:32)
151
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:28)
152
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:98)
153
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:64)
154
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:108)
155
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:106)
156
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:123)
157
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:36)
158
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:58)
159
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:54)
160
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:31)
161
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:65)
162
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:123)
163
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":38:19)
164
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":39:13)
165
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":43:34)
166
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":41:34)
167
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":44:19)
168
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:37)
169
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:42)
170
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:31)
171
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:68)
172
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:60)
173
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:119)
174
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:36)
175
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":48:20)
176
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":49:20)
177
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":50:35)
178
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:28)
179
+ #loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:100)
180
+ #loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:65)
181
+ #loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:110)
182
+ #loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:108)
183
+ #loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:126)
184
+ #loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:37)
185
+ #loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:42)
186
+ #loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:64)
187
+ #loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:60)
188
+ #loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:31)
189
+ #loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:72)
190
+ #loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:131)
191
+ #loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":53:20)
192
+ #loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":56:35)
193
+ #loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":55:35)
194
+ #loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":57:20)
195
+ #loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":59:20)
196
+ #loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":60:20)
197
+ #loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":61:35)
198
+ #loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:28)
199
+ #loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:46)
200
+ #loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:38)
201
+ #loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:56)
202
+ #loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:54)
203
+ #loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:64)
204
+ #loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:40)
205
+ #loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:36)
206
+ #loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:31)
207
+ #loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:48)
208
+ #loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:88)
209
+ #loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":64:20)
210
+ #loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":65:20)
211
+ #loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:25)
212
+ #loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:37)
213
+ #loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:4)
214
+ #loc101 = loc("xoffset"(#loc2))
215
+ #loc102 = loc("xoffset"(#loc3))
216
+ #loc103 = loc("xindex"(#loc4))
217
+ #loc104 = loc("xindex"(#loc5))
218
+ #loc105 = loc("xmask"(#loc6))
219
+ #loc106 = loc("x0"(#loc7))
220
+ #loc107 = loc("x1"(#loc8))
221
+ #loc108 = loc("x1"(#loc9))
222
+ #loc109 = loc("tmp31"(#loc10))
223
+ #loc110 = loc("tmp31"(#loc11))
224
+ #loc111 = loc("tmp31"(#loc12))
225
+ #loc112 = loc("tmp32"(#loc13))
226
+ #loc113 = loc("tmp32"(#loc14))
227
+ #loc114 = loc("tmp1"(#loc15))
228
+ #loc115 = loc("tmp2"(#loc16))
229
+ #loc116 = loc("tmp3"(#loc17))
230
+ #loc117 = loc("tmp3"(#loc18))
231
+ #loc118 = loc("tmp3"(#loc19))
232
+ #loc119 = loc("tmp3"(#loc20))
233
+ #loc120 = loc("tmp3"(#loc21))
234
+ #loc121 = loc("tmp3"(#loc22))
235
+ #loc122 = loc("tmp4"(#loc23))
236
+ #loc123 = loc("tmp5"(#loc24))
237
+ #loc124 = loc("tmp6"(#loc25))
238
+ #loc125 = loc("tmp7"(#loc26))
239
+ #loc126 = loc("tmp8"(#loc27))
240
+ #loc127 = loc("tmp10"(#loc34))
241
+ #loc128 = loc("tmp10"(#loc35))
242
+ #loc129 = loc("tmp10"(#loc36))
243
+ #loc130 = loc("tmp10"(#loc37))
244
+ #loc131 = loc("tmp10"(#loc38))
245
+ #loc132 = loc("tmp10"(#loc39))
246
+ #loc133 = loc("tmp11"(#loc40))
247
+ #loc134 = loc("tmp12"(#loc41))
248
+ #loc135 = loc("tmp16"(#loc42))
249
+ #loc136 = loc("tmp14"(#loc43))
250
+ #loc137 = loc("tmp17"(#loc44))
251
+ #loc138 = loc("tmp18"(#loc45))
252
+ #loc139 = loc("tmp18"(#loc46))
253
+ #loc140 = loc("tmp18"(#loc47))
254
+ #loc141 = loc("tmp18"(#loc48))
255
+ #loc142 = loc("tmp18"(#loc49))
256
+ #loc143 = loc("tmp18"(#loc50))
257
+ #loc144 = loc("tmp19"(#loc51))
258
+ #loc145 = loc("tmp21"(#loc52))
259
+ #loc146 = loc("tmp22"(#loc53))
260
+ #loc147 = loc("tmp23"(#loc54))
261
+ #loc148 = loc("tmp25"(#loc61))
262
+ #loc149 = loc("tmp25"(#loc62))
263
+ #loc150 = loc("tmp25"(#loc63))
264
+ #loc151 = loc("tmp25"(#loc64))
265
+ #loc152 = loc("tmp25"(#loc65))
266
+ #loc153 = loc("tmp25"(#loc66))
267
+ #loc154 = loc("tmp25"(#loc67))
268
+ #loc155 = loc("tmp26"(#loc68))
269
+ #loc156 = loc("tmp29"(#loc69))
270
+ #loc157 = loc("tmp28"(#loc70))
271
+ #loc158 = loc("tmp30"(#loc71))
272
+ #loc159 = loc("tmp34"(#loc72))
273
+ #loc160 = loc("tmp35"(#loc73))
274
+ #loc161 = loc("tmp36"(#loc74))
275
+ #loc162 = loc("tmp38"(#loc81))
276
+ #loc163 = loc("tmp38"(#loc82))
277
+ #loc164 = loc("tmp38"(#loc83))
278
+ #loc165 = loc("tmp38"(#loc84))
279
+ #loc166 = loc("tmp38"(#loc85))
280
+ #loc167 = loc("tmp39"(#loc86))
281
+ #loc168 = loc("tmp40"(#loc87))
282
+ #loc169 = loc(fused[#loc135, #loc136])
283
+ #loc170 = loc(fused[#loc156, #loc157])
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/__grp__triton_red_fused_zeros_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.json"}}
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.cubin ADDED
Binary file (17.7 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "34e25b732afeb5d6a31b0c6feb7a22f95462a59f2200623b625166fff2db10d2", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"}
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.llir ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
9
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %9 = shl i32 %8, 6, !dbg !8
11
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %11 = and i32 %10, 126, !dbg !9
13
+ %12 = lshr exact i32 %11, 1, !dbg !9
14
+ %13 = or disjoint i32 %12, %9, !dbg !10
15
+ %14 = shl nuw nsw i32 %10, 2, !dbg !11
16
+ %15 = and i32 %14, 4, !dbg !11
17
+ %16 = sdiv i32 %13, 2048, !dbg !12
18
+ %17 = mul i32 %16, 2048, !dbg !13
19
+ %.decomposed = sub i32 %13, %17, !dbg !13
20
+ %18 = srem i32 %16, 32, !dbg !14
21
+ %19 = sdiv i32 %13, 65536, !dbg !15
22
+ %20 = shl nsw i32 %18, 7, !dbg !16
23
+ %21 = shl nsw i32 %.decomposed, 12, !dbg !17
24
+ %22 = shl i32 %19, 23, !dbg !18
25
+ %23 = shl i32 %13, 7, !dbg !19
26
+ %24 = add i32 %22, %21
27
+ %25 = add i32 %24, %20
28
+ %26 = zext nneg i32 %15 to i64, !dbg !20
29
+ %27 = sext i32 %23 to i64, !dbg !20
30
+ %invariant.gep = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !20
31
+ br label %28, !dbg !20
32
+
33
+ 28: ; preds = %7, %28
34
+ %indvars.iv = phi i64 [ 0, %7 ], [ %indvars.iv.next, %28 ]
35
+ %29 = phi float [ 0.000000e+00, %7 ], [ %70, %28 ]
36
+ %30 = phi float [ 0.000000e+00, %7 ], [ %71, %28 ]
37
+ %31 = phi float [ 0.000000e+00, %7 ], [ %72, %28 ]
38
+ %32 = phi float [ 0.000000e+00, %7 ], [ %73, %28 ]
39
+ %33 = or disjoint i64 %indvars.iv, %26, !dbg !21
40
+ %34 = trunc nuw nsw i64 %33 to i32, !dbg !22
41
+ %35 = add i32 %25, %34, !dbg !22
42
+ %36 = sext i32 %35 to i64, !dbg !23
43
+ %37 = getelementptr bfloat, ptr addrspace(1) %0, i64 %36, !dbg !23
44
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !24
45
+ %39 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %37, i64 %38, i1 true) #4, !dbg !24
46
+ %40 = extractvalue { i32, i32 } %39, 0, !dbg !24
47
+ %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !24
48
+ %42 = extractvalue { i32, i32 } %39, 1, !dbg !24
49
+ %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !24
50
+ %44 = extractelement <2 x bfloat> %41, i64 0, !dbg !24
51
+ %45 = extractelement <2 x bfloat> %41, i64 1, !dbg !24
52
+ %46 = extractelement <2 x bfloat> %43, i64 0, !dbg !24
53
+ %47 = extractelement <2 x bfloat> %43, i64 1, !dbg !24
54
+ %48 = fpext bfloat %44 to float, !dbg !25
55
+ %49 = fpext bfloat %45 to float, !dbg !25
56
+ %50 = fpext bfloat %46 to float, !dbg !25
57
+ %51 = fpext bfloat %47 to float, !dbg !25
58
+ %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %33, !dbg !26
59
+ %52 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27
60
+ %53 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %gep, i64 %52, i1 true) #4, !dbg !27
61
+ %54 = extractvalue { i32, i32 } %53, 0, !dbg !27
62
+ %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !27
63
+ %56 = extractvalue { i32, i32 } %53, 1, !dbg !27
64
+ %57 = bitcast i32 %56 to <2 x bfloat>, !dbg !27
65
+ %58 = extractelement <2 x bfloat> %55, i64 0, !dbg !27
66
+ %59 = extractelement <2 x bfloat> %55, i64 1, !dbg !27
67
+ %60 = extractelement <2 x bfloat> %57, i64 0, !dbg !27
68
+ %61 = extractelement <2 x bfloat> %57, i64 1, !dbg !27
69
+ %62 = fpext bfloat %58 to float, !dbg !28
70
+ %63 = fpext bfloat %59 to float, !dbg !28
71
+ %64 = fpext bfloat %60 to float, !dbg !28
72
+ %65 = fpext bfloat %61 to float, !dbg !28
73
+ %66 = fmul float %48, %62, !dbg !29
74
+ %67 = fmul float %49, %63, !dbg !29
75
+ %68 = fmul float %50, %64, !dbg !29
76
+ %69 = fmul float %51, %65, !dbg !29
77
+ %70 = fadd float %29, %66, !dbg !30
78
+ %71 = fadd float %30, %67, !dbg !30
79
+ %72 = fadd float %31, %68, !dbg !30
80
+ %73 = fadd float %32, %69, !dbg !30
81
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !20
82
+ %74 = icmp samesign ult i64 %indvars.iv, 120, !dbg !20
83
+ br i1 %74, label %28, label %75, !dbg !20
84
+
85
+ 75: ; preds = %28
86
+ %76 = and i32 %10, 63, !dbg !9
87
+ %77 = or disjoint i32 %9, %76, !dbg !10
88
+ %78 = fadd float %70, %71, !dbg !31
89
+ %79 = fadd float %72, %78, !dbg !31
90
+ %80 = fadd float %73, %79, !dbg !31
91
+ %81 = bitcast float %80 to i32, !dbg !35
92
+ %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 1, i32 31), !dbg !35
93
+ %83 = bitcast i32 %82 to float, !dbg !35
94
+ %84 = fadd float %80, %83, !dbg !31
95
+ %85 = shl nuw nsw i32 %11, 1, !dbg !36
96
+ %86 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %85, !dbg !36
97
+ store float %84, ptr addrspace(3) %86, align 4, !dbg !36
98
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36
99
+ %87 = shl nuw nsw i32 %76, 2, !dbg !36
100
+ %88 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %87, !dbg !36
101
+ %89 = load i32, ptr addrspace(3) %88, align 4, !dbg !36
102
+ %90 = sext i32 %77 to i64, !dbg !37
103
+ %91 = getelementptr float, ptr addrspace(1) %2, i64 %90, !dbg !37
104
+ %92 = and i32 %10, 64, !dbg !38
105
+ %93 = icmp eq i32 %92, 0, !dbg !38
106
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %89, ptr addrspace(1) %91, i1 %93) #4, !dbg !38
107
+ ret void, !dbg !39
108
+ }
109
+
110
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
111
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
112
+
113
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
114
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
115
+
116
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
117
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
118
+
119
+ ; Function Attrs: convergent nocallback nounwind
120
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
121
+
122
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
123
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
124
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
125
+ attributes #3 = { convergent nocallback nounwind }
126
+ attributes #4 = { nounwind }
127
+
128
+ !llvm.dbg.cu = !{!0}
129
+ !llvm.module.flags = !{!2, !3}
130
+
131
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
132
+ !1 = !DIFile(filename: "cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og")
133
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
134
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
135
+ !4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
136
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
137
+ !6 = !{}
138
+ !7 = !DILocation(line: 23, column: 28, scope: !4)
139
+ !8 = !DILocation(line: 23, column: 33, scope: !4)
140
+ !9 = !DILocation(line: 24, column: 44, scope: !4)
141
+ !10 = !DILocation(line: 24, column: 23, scope: !4)
142
+ !11 = !DILocation(line: 26, column: 37, scope: !4)
143
+ !12 = !DILocation(line: 29, column: 21, scope: !4)
144
+ !13 = !DILocation(line: 28, column: 19, scope: !4)
145
+ !14 = !DILocation(line: 29, column: 29, scope: !4)
146
+ !15 = !DILocation(line: 30, column: 19, scope: !4)
147
+ !16 = !DILocation(line: 39, column: 45, scope: !4)
148
+ !17 = !DILocation(line: 39, column: 55, scope: !4)
149
+ !18 = !DILocation(line: 39, column: 68, scope: !4)
150
+ !19 = !DILocation(line: 40, column: 45, scope: !4)
151
+ !20 = !DILocation(line: 33, column: 40, scope: !4)
152
+ !21 = !DILocation(line: 34, column: 31, scope: !4)
153
+ !22 = !DILocation(line: 39, column: 60, scope: !4)
154
+ !23 = !DILocation(line: 39, column: 34, scope: !4)
155
+ !24 = !DILocation(line: 39, column: 73, scope: !4)
156
+ !25 = !DILocation(line: 39, column: 127, scope: !4)
157
+ !26 = !DILocation(line: 40, column: 34, scope: !4)
158
+ !27 = !DILocation(line: 40, column: 50, scope: !4)
159
+ !28 = !DILocation(line: 40, column: 104, scope: !4)
160
+ !29 = !DILocation(line: 41, column: 22, scope: !4)
161
+ !30 = !DILocation(line: 43, column: 23, scope: !4)
162
+ !31 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34)
163
+ !32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0)
164
+ !33 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language")
165
+ !34 = !DILocation(line: 45, column: 25, scope: !4)
166
+ !35 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34)
167
+ !36 = !DILocation(line: 45, column: 28, scope: !4)
168
+ !37 = !DILocation(line: 49, column: 25, scope: !4)
169
+ !38 = !DILocation(line: 49, column: 36, scope: !4)
170
+ !39 = !DILocation(line: 49, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ptx ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_red_fused_zeros_0
12
+ .visible .entry triton_red_fused_zeros_0(
13
+ .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1,
15
+ .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2,
16
+ .param .u32 triton_red_fused_zeros_0_param_3,
17
+ .param .u32 triton_red_fused_zeros_0_param_4,
18
+ .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5,
19
+ .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6
20
+ )
21
+ .reqntid 128
22
+ {
23
+ .reg .pred %p<5>;
24
+ .reg .b16 %rs<9>;
25
+ .reg .b32 %r<79>;
26
+ .reg .b64 %rd<24>;
27
+ .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0
28
+ $L__func_begin0:
29
+ .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0
30
+
31
+ // %bb.0:
32
+ ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_2];
33
+ ld.param.b64 %rd7, [triton_red_fused_zeros_0_param_0];
34
+ ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_1];
35
+ $L__tmp0:
36
+ .loc 1 23 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:28
37
+ mov.u32 %r13, %ctaid.x;
38
+ .loc 1 23 33 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:33
39
+ shl.b32 %r1, %r13, 6;
40
+ .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44
41
+ mov.u32 %r2, %tid.x;
42
+ and.b32 %r3, %r2, 126;
43
+ bfe.u32 %r14, %r2, 1, 6;
44
+ .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23
45
+ or.b32 %r15, %r14, %r1;
46
+ .loc 1 26 37 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:26:37
47
+ shl.b32 %r16, %r2, 2;
48
+ and.b32 %r17, %r16, 4;
49
+ .loc 1 29 21 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:21
50
+ bfe.s32 %r18, %r13, 25, 1;
51
+ shr.u32 %r19, %r18, 21;
52
+ add.s32 %r20, %r15, %r19;
53
+ shr.s32 %r21, %r20, 11;
54
+ .loc 1 29 29 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:29
55
+ shr.u32 %r22, %r21, 27;
56
+ add.s32 %r23, %r21, %r22;
57
+ and.b32 %r24, %r23, 33554400;
58
+ sub.s32 %r25, %r21, %r24;
59
+ .loc 1 30 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:30:19
60
+ shr.u32 %r26, %r18, 16;
61
+ add.s32 %r27, %r15, %r26;
62
+ .loc 1 39 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:45
63
+ shl.b32 %r28, %r25, 7;
64
+ .loc 1 39 68 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:68
65
+ shl.b32 %r29, %r27, 7;
66
+ and.b32 %r30, %r29, -8388608;
67
+ .loc 1 33 40 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:33:40
68
+ and.b32 %r31, %r2, 1;
69
+ mul.wide.u32 %rd11, %r31, 8;
70
+ shl.b32 %r32, %r13, 13;
71
+ shl.b32 %r33, %r14, 7;
72
+ or.b32 %r34, %r32, %r33;
73
+ mul.wide.s32 %rd12, %r34, 2;
74
+ or.b64 %rd13, %rd11, %rd12;
75
+ add.s64 %rd22, %rd10, %rd13;
76
+ shl.b32 %r35, %r13, 18;
77
+ add.s32 %r36, %r30, %r35;
78
+ shl.b32 %r37, %r14, 12;
79
+ or.b32 %r38, %r36, %r37;
80
+ add.s32 %r39, %r38, %r28;
81
+ or.b32 %r40, %r39, %r17;
82
+ shl.b32 %r41, %r21, 23;
83
+ sub.s32 %r42, %r40, %r41;
84
+ cvt.u64.u32 %rd2, %r42;
85
+ mov.b32 %r75, 0f00000000;
86
+ mov.b64 %rd23, -8;
87
+ mov.b32 %r76, %r75;
88
+ mov.b32 %r77, %r75;
89
+ mov.b32 %r78, %r75;
90
+ $L__BB0_1: // =>This Inner Loop Header: Depth=1
91
+ .loc 1 39 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:34
92
+ add.s64 %rd20, %rd2, %rd23;
93
+ cvt.u32.u64 %r51, %rd20;
94
+ add.s32 %r52, %r51, 8;
95
+ mad.wide.s32 %rd15, %r52, 2, %rd7;
96
+ .loc 1 39 73 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:73
97
+ // begin inline asm
98
+ mov.u64 %rd14, 0x0;
99
+ createpolicy.fractional.L2::evict_first.b64 %rd14, 1.0;
100
+ // end inline asm
101
+ mov.b32 %r45, 0;
102
+ mov.pred %p1, -1;
103
+ // begin inline asm
104
+ mov.u32 %r43, %r45;
105
+ mov.u32 %r44, %r45;
106
+ @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r43, %r44 }, [ %rd15 + 0 ], %rd14;
107
+ // end inline asm
108
+ mov.b32 {%rs1, %rs2}, %r43;
109
+ mov.b32 {%rs3, %rs4}, %r44;
110
+ .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127
111
+ cvt.f32.bf16 %r53, %rs1;
112
+ cvt.f32.bf16 %r54, %rs2;
113
+ cvt.f32.bf16 %r55, %rs3;
114
+ cvt.f32.bf16 %r56, %rs4;
115
+ .loc 1 40 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:50
116
+ // begin inline asm
117
+ mov.u64 %rd17, 0x0;
118
+ createpolicy.fractional.L2::evict_first.b64 %rd17, 1.0;
119
+ // end inline asm
120
+ // begin inline asm
121
+ mov.u32 %r47, %r45;
122
+ mov.u32 %r48, %r45;
123
+ @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r47, %r48 }, [ %rd22 + 0 ], %rd17;
124
+ // end inline asm
125
+ mov.b32 {%rs5, %rs6}, %r47;
126
+ mov.b32 {%rs7, %rs8}, %r48;
127
+ .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104
128
+ cvt.f32.bf16 %r57, %rs5;
129
+ cvt.f32.bf16 %r58, %rs6;
130
+ cvt.f32.bf16 %r59, %rs7;
131
+ cvt.f32.bf16 %r60, %rs8;
132
+ .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23
133
+ fma.rn.f32 %r75, %r53, %r57, %r75;
134
+ fma.rn.f32 %r76, %r54, %r58, %r76;
135
+ fma.rn.f32 %r77, %r55, %r59, %r77;
136
+ fma.rn.f32 %r78, %r56, %r60, %r78;
137
+ .loc 1 33 40 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:33:40
138
+ add.s64 %rd23, %rd23, 8;
139
+ add.s64 %rd22, %rd22, 16;
140
+ setp.lt.u64 %p3, %rd23, 120;
141
+ @%p3 bra $L__BB0_1;
142
+ // %bb.2:
143
+ .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44
144
+ and.b32 %r62, %r2, 63;
145
+ .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23
146
+ or.b32 %r63, %r1, %r62;
147
+ $L__tmp1:
148
+ .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ]
149
+ add.f32 %r64, %r75, %r76;
150
+ add.f32 %r65, %r77, %r64;
151
+ add.f32 %r66, %r78, %r65;
152
+ .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ]
153
+ shfl.sync.bfly.b32 %r67, %r66, 1, 31, -1;
154
+ .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ]
155
+ add.f32 %r68, %r66, %r67;
156
+ $L__tmp2:
157
+ .loc 1 45 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:28
158
+ shl.b32 %r69, %r3, 1;
159
+ mov.b32 %r70, global_smem;
160
+ add.s32 %r71, %r70, %r69;
161
+ st.shared.b32 [%r71], %r68;
162
+ bar.sync 0;
163
+ shl.b32 %r72, %r62, 2;
164
+ add.s32 %r73, %r70, %r72;
165
+ ld.shared.b32 %r61, [%r73];
166
+ .loc 1 49 25 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:25
167
+ mad.wide.s32 %rd21, %r63, 4, %rd8;
168
+ .loc 1 49 36 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:36
169
+ and.b32 %r74, %r2, 64;
170
+ setp.eq.b32 %p4, %r74, 0;
171
+ // begin inline asm
172
+ @%p4 st.global.b32 [ %rd21 + 0 ], { %r61 };
173
+ // end inline asm
174
+ .loc 1 49 4 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:4
175
+ ret;
176
+ $L__tmp3:
177
+ $L__func_end0:
178
+ // -- End function
179
+ }
180
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py"
181
+ .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py"
182
+ .section .debug_abbrev
183
+ {
184
+ .b8 1 // Abbreviation Code
185
+ .b8 17 // DW_TAG_compile_unit
186
+ .b8 1 // DW_CHILDREN_yes
187
+ .b8 37 // DW_AT_producer
188
+ .b8 8 // DW_FORM_string
189
+ .b8 19 // DW_AT_language
190
+ .b8 5 // DW_FORM_data2
191
+ .b8 3 // DW_AT_name
192
+ .b8 8 // DW_FORM_string
193
+ .b8 16 // DW_AT_stmt_list
194
+ .b8 6 // DW_FORM_data4
195
+ .b8 27 // DW_AT_comp_dir
196
+ .b8 8 // DW_FORM_string
197
+ .b8 0 // EOM(1)
198
+ .b8 0 // EOM(2)
199
+ .b8 2 // Abbreviation Code
200
+ .b8 46 // DW_TAG_subprogram
201
+ .b8 0 // DW_CHILDREN_no
202
+ .b8 3 // DW_AT_name
203
+ .b8 8 // DW_FORM_string
204
+ .b8 32 // DW_AT_inline
205
+ .b8 11 // DW_FORM_data1
206
+ .b8 0 // EOM(1)
207
+ .b8 0 // EOM(2)
208
+ .b8 3 // Abbreviation Code
209
+ .b8 46 // DW_TAG_subprogram
210
+ .b8 1 // DW_CHILDREN_yes
211
+ .b8 17 // DW_AT_low_pc
212
+ .b8 1 // DW_FORM_addr
213
+ .b8 18 // DW_AT_high_pc
214
+ .b8 1 // DW_FORM_addr
215
+ .b8 49 // DW_AT_abstract_origin
216
+ .b8 19 // DW_FORM_ref4
217
+ .b8 0 // EOM(1)
218
+ .b8 0 // EOM(2)
219
+ .b8 4 // Abbreviation Code
220
+ .b8 29 // DW_TAG_inlined_subroutine
221
+ .b8 0 // DW_CHILDREN_no
222
+ .b8 49 // DW_AT_abstract_origin
223
+ .b8 19 // DW_FORM_ref4
224
+ .b8 17 // DW_AT_low_pc
225
+ .b8 1 // DW_FORM_addr
226
+ .b8 18 // DW_AT_high_pc
227
+ .b8 1 // DW_FORM_addr
228
+ .b8 88 // DW_AT_call_file
229
+ .b8 11 // DW_FORM_data1
230
+ .b8 89 // DW_AT_call_line
231
+ .b8 11 // DW_FORM_data1
232
+ .b8 87 // DW_AT_call_column
233
+ .b8 11 // DW_FORM_data1
234
+ .b8 0 // EOM(1)
235
+ .b8 0 // EOM(2)
236
+ .b8 0 // EOM(3)
237
+ }
238
+ .section .debug_info
239
+ {
240
+ .b32 209 // Length of Unit
241
+ .b8 2 // DWARF version number
242
+ .b8 0
243
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
244
+ .b8 8 // Address Size (in bytes)
245
+ .b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit
246
+ .b8 116 // DW_AT_producer
247
+ .b8 114
248
+ .b8 105
249
+ .b8 116
250
+ .b8 111
251
+ .b8 110
252
+ .b8 0
253
+ .b8 2 // DW_AT_language
254
+ .b8 0
255
+ .b8 99 // DW_AT_name
256
+ .b8 111
257
+ .b8 103
258
+ .b8 111
259
+ .b8 108
260
+ .b8 53
261
+ .b8 53
262
+ .b8 99
263
+ .b8 116
264
+ .b8 104
265
+ .b8 107
266
+ .b8 52
267
+ .b8 122
268
+ .b8 101
269
+ .b8 118
270
+ .b8 115
271
+ .b8 121
272
+ .b8 51
273
+ .b8 100
274
+ .b8 108
275
+ .b8 113
276
+ .b8 105
277
+ .b8 121
278
+ .b8 122
279
+ .b8 105
280
+ .b8 112
281
+ .b8 101
282
+ .b8 102
283
+ .b8 118
284
+ .b8 55
285
+ .b8 51
286
+ .b8 53
287
+ .b8 103
288
+ .b8 101
289
+ .b8 50
290
+ .b8 119
291
+ .b8 116
292
+ .b8 97
293
+ .b8 100
294
+ .b8 100
295
+ .b8 118
296
+ .b8 107
297
+ .b8 52
298
+ .b8 51
299
+ .b8 54
300
+ .b8 113
301
+ .b8 104
302
+ .b8 116
303
+ .b8 53
304
+ .b8 110
305
+ .b8 111
306
+ .b8 120
307
+ .b8 46
308
+ .b8 112
309
+ .b8 121
310
+ .b8 0
311
+ .b32 .debug_line // DW_AT_stmt_list
312
+ .b8 47 // DW_AT_comp_dir
313
+ .b8 119
314
+ .b8 111
315
+ .b8 114
316
+ .b8 107
317
+ .b8 115
318
+ .b8 112
319
+ .b8 97
320
+ .b8 99
321
+ .b8 101
322
+ .b8 47
323
+ .b8 104
324
+ .b8 97
325
+ .b8 110
326
+ .b8 114
327
+ .b8 117
328
+ .b8 105
329
+ .b8 47
330
+ .b8 83
331
+ .b8 112
332
+ .b8 101
333
+ .b8 99
334
+ .b8 70
335
+ .b8 111
336
+ .b8 114
337
+ .b8 103
338
+ .b8 101
339
+ .b8 45
340
+ .b8 101
341
+ .b8 120
342
+ .b8 116
343
+ .b8 47
344
+ .b8 99
345
+ .b8 97
346
+ .b8 99
347
+ .b8 104
348
+ .b8 101
349
+ .b8 47
350
+ .b8 99
351
+ .b8 111
352
+ .b8 109
353
+ .b8 112
354
+ .b8 105
355
+ .b8 108
356
+ .b8 101
357
+ .b8 100
358
+ .b8 95
359
+ .b8 107
360
+ .b8 101
361
+ .b8 114
362
+ .b8 110
363
+ .b8 101
364
+ .b8 108
365
+ .b8 115
366
+ .b8 47
367
+ .b8 111
368
+ .b8 103
369
+ .b8 0
370
+ .b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram
371
+ .b8 116 // DW_AT_name
372
+ .b8 114
373
+ .b8 105
374
+ .b8 116
375
+ .b8 111
376
+ .b8 110
377
+ .b8 95
378
+ .b8 114
379
+ .b8 101
380
+ .b8 100
381
+ .b8 95
382
+ .b8 102
383
+ .b8 117
384
+ .b8 115
385
+ .b8 101
386
+ .b8 100
387
+ .b8 95
388
+ .b8 122
389
+ .b8 101
390
+ .b8 114
391
+ .b8 111
392
+ .b8 115
393
+ .b8 95
394
+ .b8 48
395
+ .b8 0
396
+ .b8 1 // DW_AT_inline
397
+ .b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram
398
+ .b64 $L__func_begin0 // DW_AT_low_pc
399
+ .b64 $L__func_end0 // DW_AT_high_pc
400
+ .b32 139 // DW_AT_abstract_origin
401
+ .b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine
402
+ .b32 139 // DW_AT_abstract_origin
403
+ .b64 $L__tmp1 // DW_AT_low_pc
404
+ .b64 $L__tmp2 // DW_AT_high_pc
405
+ .b8 1 // DW_AT_call_file
406
+ .b8 45 // DW_AT_call_line
407
+ .b8 25 // DW_AT_call_column
408
+ .b8 0 // End Of Children Mark
409
+ .b8 0 // End Of Children Mark
410
+ }
411
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.source ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0)
2
+ #loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
3
+ #loc46 = loc(unknown)
4
+ #loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
5
+ #loc53 = loc("in_ptr0"(#loc))
6
+ #loc54 = loc("in_ptr1"(#loc))
7
+ #loc55 = loc("out_ptr1"(#loc))
8
+ #loc56 = loc("xnumel"(#loc))
9
+ #loc57 = loc("r0_numel"(#loc))
10
+ #loc97 = loc("input"(#loc44))
11
+ #loc98 = loc("a"(#loc49))
12
+ #loc99 = loc("b"(#loc49))
13
+ module {
14
+ tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
15
+ %xnumel_0 = arith.constant 131072 : i32 loc(#loc58)
16
+ %r0_numel_1 = arith.constant 128 : i32 loc(#loc59)
17
+ %xoffset = tt.get_program_id x : i32 loc(#loc60)
18
+ %xoffset_2 = arith.constant 64 : i32 loc(#loc61)
19
+ %xoffset_3 = arith.constant 64 : i32 loc(#loc61)
20
+ %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61)
21
+ %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62)
22
+ %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63)
23
+ %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64)
24
+ %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64)
25
+ %xmask = arith.constant true loc(#loc65)
26
+ %xmask_8 = arith.constant dense<true> : tensor<64x8xi1> loc(#loc65)
27
+ %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc66)
28
+ %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc67)
29
+ %x0 = arith.constant 2048 : i32 loc(#loc68)
30
+ %x0_10 = arith.constant 2048 : i32 loc(#loc68)
31
+ %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68)
32
+ %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68)
33
+ %x1 = arith.constant 2048 : i32 loc(#loc69)
34
+ %x1_13 = arith.constant 2048 : i32 loc(#loc69)
35
+ %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69)
36
+ %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69)
37
+ %x1_16 = arith.constant 32 : i32 loc(#loc70)
38
+ %x1_17 = arith.constant 32 : i32 loc(#loc70)
39
+ %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70)
40
+ %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70)
41
+ %x2 = arith.constant 65536 : i32 loc(#loc71)
42
+ %x2_20 = arith.constant 65536 : i32 loc(#loc71)
43
+ %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71)
44
+ %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71)
45
+ %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72)
46
+ %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc72)
47
+ %c0_i32 = arith.constant 0 : i32 loc(#loc16)
48
+ %c8_i32 = arith.constant 8 : i32 loc(#loc16)
49
+ %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16)
50
+ %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16)
51
+ %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc16)
52
+ %3 = ub.poison : i32 loc(#loc16)
53
+ %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x8xf32>) : i32 {
54
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc74)
55
+ %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc74)
56
+ %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc75)
57
+ %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x8xi32> loc(#loc75)
58
+ %tmp0 = arith.constant 128 : i32 loc(#loc76)
59
+ %tmp0_30 = arith.constant 128 : i32 loc(#loc76)
60
+ %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76)
61
+ %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76)
62
+ %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc77)
63
+ %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc77)
64
+ %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x8xi32> loc(#loc77)
65
+ %tmp0_36 = arith.constant 4096 : i32 loc(#loc78)
66
+ %tmp0_37 = arith.constant 4096 : i32 loc(#loc78)
67
+ %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78)
68
+ %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78)
69
+ %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc79)
70
+ %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x8xi32> loc(#loc79)
71
+ %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80)
72
+ %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80)
73
+ %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80)
74
+ %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80)
75
+ %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc81)
76
+ %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x8xi32> loc(#loc81)
77
+ %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc82)
78
+ %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc82)
79
+ %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83)
80
+ %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc83)
81
+ %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc83)
82
+ %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc83)
83
+ %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc83)
84
+ %tmp0_55 = arith.extf %tmp0_54 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc84)
85
+ %tmp1 = arith.constant 128 : i32 loc(#loc85)
86
+ %tmp1_56 = arith.constant 128 : i32 loc(#loc85)
87
+ %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85)
88
+ %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85)
89
+ %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc86)
90
+ %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc86)
91
+ %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x8xi32> loc(#loc86)
92
+ %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc87)
93
+ %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc87)
94
+ %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88)
95
+ %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc88)
96
+ %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc88)
97
+ %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc88)
98
+ %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc88)
99
+ %tmp1_69 = arith.extf %tmp1_68 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc89)
100
+ %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x8xf32> loc(#loc90)
101
+ %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x8xf32> loc(#loc91)
102
+ %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc92)
103
+ %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc92)
104
+ scf.yield %_tmp4_71 : tensor<64x8xf32> loc(#loc36)
105
+ } loc(#loc73)
106
+ %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc93)
107
+ %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94)
108
+ %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95)
109
+ %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96)
110
+ %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96)
111
+ %4 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc41)
112
+ %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc41)
113
+ tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr<f32>> loc(#loc42)
114
+ tt.return loc(#loc43)
115
+ } loc(#loc)
116
+ tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} {
117
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
118
+ ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
119
+ %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45)
120
+ tt.reduce.return %2 : f32 loc(#loc45)
121
+ }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc45)
122
+ tt.return %0 : tensor<64xf32> loc(#loc47)
123
+ ^bb1: // no predecessors
124
+ %1 = ub.poison : tensor<64xf32> loc(#loc48)
125
+ tt.return %1 : tensor<64xf32> loc(#loc48)
126
+ } loc(#loc44)
127
+ tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} {
128
+ %0 = arith.addf %a, %b : f32 loc(#loc50)
129
+ tt.return %0 : f32 loc(#loc51)
130
+ ^bb1: // no predecessors
131
+ %1 = ub.poison : f32 loc(#loc52)
132
+ tt.return %1 : f32 loc(#loc52)
133
+ } loc(#loc49)
134
+ } loc(#loc)
135
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":19:13)
136
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":20:15)
137
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28)
138
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33)
139
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36)
140
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44)
141
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23)
142
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":25:46)
143
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27)
144
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37)
145
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19)
146
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21)
147
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29)
148
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19)
149
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":32:43)
150
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40)
151
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31)
152
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29)
153
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45)
154
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41)
155
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55)
156
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50)
157
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68)
158
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60)
159
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34)
160
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73)
161
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127)
162
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45)
163
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41)
164
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34)
165
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50)
166
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104)
167
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22)
168
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23)
169
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40)
170
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8)
171
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25)
172
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28)
173
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":47:11)
174
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":48:18)
175
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25)
176
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36)
177
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4)
178
+ #loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
179
+ #loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
180
+ #loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
181
+ #loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
182
+ #loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
183
+ #loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
184
+ #loc58 = loc("xnumel"(#loc1))
185
+ #loc59 = loc("r0_numel"(#loc2))
186
+ #loc60 = loc("xoffset"(#loc3))
187
+ #loc61 = loc("xoffset"(#loc4))
188
+ #loc62 = loc("xindex"(#loc5))
189
+ #loc63 = loc("xindex"(#loc6))
190
+ #loc64 = loc("xindex"(#loc7))
191
+ #loc65 = loc("xmask"(#loc8))
192
+ #loc66 = loc("r0_base"(#loc9))
193
+ #loc67 = loc("r0_base"(#loc10))
194
+ #loc68 = loc("x0"(#loc11))
195
+ #loc69 = loc("x1"(#loc12))
196
+ #loc70 = loc("x1"(#loc13))
197
+ #loc71 = loc("x2"(#loc14))
198
+ #loc72 = loc("_tmp4"(#loc15))
199
+ #loc73 = loc("_tmp4"(#loc16))
200
+ #loc74 = loc("r0_index"(#loc17))
201
+ #loc75 = loc("r0_mask"(#loc18))
202
+ #loc76 = loc("tmp0"(#loc19))
203
+ #loc77 = loc("tmp0"(#loc20))
204
+ #loc78 = loc("tmp0"(#loc21))
205
+ #loc79 = loc("tmp0"(#loc22))
206
+ #loc80 = loc("tmp0"(#loc23))
207
+ #loc81 = loc("tmp0"(#loc24))
208
+ #loc82 = loc("tmp0"(#loc25))
209
+ #loc83 = loc("tmp0"(#loc26))
210
+ #loc84 = loc("tmp0"(#loc27))
211
+ #loc85 = loc("tmp1"(#loc28))
212
+ #loc86 = loc("tmp1"(#loc29))
213
+ #loc87 = loc("tmp1"(#loc30))
214
+ #loc88 = loc("tmp1"(#loc31))
215
+ #loc89 = loc("tmp1"(#loc32))
216
+ #loc90 = loc("tmp2"(#loc33))
217
+ #loc91 = loc("tmp5"(#loc34))
218
+ #loc92 = loc("_tmp4"(#loc35))
219
+ #loc93 = loc("tmp4"(#loc37))
220
+ #loc94 = loc("tmp4"(#loc38))
221
+ #loc95 = loc("tmp7"(#loc39))
222
+ #loc96 = loc("tmp8"(#loc40))
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ttgir ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
3
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0)
4
+ #loc1 = loc(unknown)
5
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25)
6
+ #loc39 = loc("in_ptr0"(#loc))
7
+ #loc40 = loc("in_ptr1"(#loc))
8
+ #loc41 = loc("out_ptr1"(#loc))
9
+ #loc42 = loc("xnumel"(#loc))
10
+ #loc43 = loc("r0_numel"(#loc))
11
+ #loc73 = loc("tmp4"(#loc33))
12
+ #loc76 = loc(callsite(#loc1 at #loc73))
13
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
14
+ tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
15
+ %cst = arith.constant dense<128> : tensor<1x8xi32, #blocked> loc(#loc1)
16
+ %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
17
+ %cst_1 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1)
18
+ %cst_2 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1)
19
+ %cst_3 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1)
20
+ %cst_4 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
21
+ %cst_5 = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1)
22
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
23
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> loc(#loc1)
24
+ %c8_i32 = arith.constant 8 : i32 loc(#loc1)
25
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
26
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
27
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> loc(#loc1)
28
+ %xoffset = tt.get_program_id x : i32 loc(#loc44)
29
+ %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45)
30
+ %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46)
31
+ %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46)
32
+ %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46)
33
+ %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46)
34
+ %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47)
35
+ %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47)
36
+ %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47)
37
+ %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47)
38
+ %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48)
39
+ %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi32, #blocked> loc(#loc48)
40
+ %x0 = arith.remsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc49)
41
+ %x1 = arith.divsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc50)
42
+ %x1_17 = arith.remsi %x1, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc51)
43
+ %x2 = arith.divsi %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc52)
44
+ %tmp0 = arith.muli %x1_17, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc53)
45
+ %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc54)
46
+ %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc55)
47
+ %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc56)
48
+ %tmp0_21 = arith.muli %x2, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc57)
49
+ %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc58)
50
+ %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc59)
51
+ %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc60)
52
+ %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc61)
53
+ %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc62)
54
+ %_tmp4 = scf.for %_tmp4_28 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_7) -> (tensor<64x8xf32, #blocked>) : i32 {
55
+ %r0_index = tt.splat %_tmp4_28 : i32 -> tensor<1x8xi32, #blocked> loc(#loc64)
56
+ %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x8xi32, #blocked> loc(#loc64)
57
+ %r0_mask = arith.cmpi slt, %r0_index_29, %cst : tensor<1x8xi32, #blocked> loc(#loc65)
58
+ %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc54)
59
+ %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x8xi32, #blocked> loc(#loc54)
60
+ %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x8xi32, #blocked> loc(#loc56)
61
+ %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<64x8xi32, #blocked> loc(#loc58)
62
+ %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x8x!tt.ptr<bf16>, #blocked>, tensor<64x8xi32, #blocked> loc(#loc59)
63
+ %tmp0_35 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc66)
64
+ %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc66)
65
+ %tmp0_37 = arith.extf %tmp0_36 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc67)
66
+ %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x8xi32, #blocked> loc(#loc61)
67
+ %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x8x!tt.ptr<bf16>, #blocked>, tensor<64x8xi32, #blocked> loc(#loc62)
68
+ %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc68)
69
+ %tmp1_41 = arith.extf %tmp1_40 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc69)
70
+ %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x8xf32, #blocked> loc(#loc70)
71
+ %tmp5 = arith.addf %arg6, %tmp2 : tensor<64x8xf32, #blocked> loc(#loc71)
72
+ %_tmp4_42 = arith.select %tmp0_35, %tmp5, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc72)
73
+ scf.yield %_tmp4_42 : tensor<64x8xf32, #blocked> loc(#loc31)
74
+ } loc(#loc63)
75
+ %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
76
+ ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))):
77
+ %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77)
78
+ tt.reduce.return %tmp4_30 : f32 loc(#loc75)
79
+ }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75)
80
+ %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74)
81
+ %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74)
82
+ %0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc36)
83
+ %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr<f32>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36)
84
+ tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc37)
85
+ tt.return loc(#loc38)
86
+ } loc(#loc)
87
+ } loc(#loc)
88
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28)
89
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33)
90
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44)
91
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23)
92
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37)
93
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19)
94
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21)
95
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29)
96
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19)
97
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45)
98
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41)
99
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55)
100
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50)
101
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68)
102
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60)
103
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34)
104
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45)
105
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41)
106
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34)
107
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40)
108
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31)
109
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29)
110
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73)
111
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127)
112
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50)
113
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104)
114
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22)
115
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23)
116
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40)
117
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8)
118
+ #loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
119
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
120
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28)
121
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25)
122
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36)
123
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4)
124
+ #loc44 = loc("xoffset"(#loc2))
125
+ #loc45 = loc("xoffset"(#loc3))
126
+ #loc46 = loc("xindex"(#loc4))
127
+ #loc47 = loc("xindex"(#loc5))
128
+ #loc48 = loc("r0_base"(#loc6))
129
+ #loc49 = loc("x0"(#loc7))
130
+ #loc50 = loc("x1"(#loc8))
131
+ #loc51 = loc("x1"(#loc9))
132
+ #loc52 = loc("x2"(#loc10))
133
+ #loc53 = loc("tmp0"(#loc11))
134
+ #loc54 = loc("tmp0"(#loc12))
135
+ #loc55 = loc("tmp0"(#loc13))
136
+ #loc56 = loc("tmp0"(#loc14))
137
+ #loc57 = loc("tmp0"(#loc15))
138
+ #loc58 = loc("tmp0"(#loc16))
139
+ #loc59 = loc("tmp0"(#loc17))
140
+ #loc60 = loc("tmp1"(#loc18))
141
+ #loc61 = loc("tmp1"(#loc19))
142
+ #loc62 = loc("tmp1"(#loc20))
143
+ #loc63 = loc("_tmp4"(#loc21))
144
+ #loc64 = loc("r0_index"(#loc22))
145
+ #loc65 = loc("r0_mask"(#loc23))
146
+ #loc66 = loc("tmp0"(#loc24))
147
+ #loc67 = loc("tmp0"(#loc25))
148
+ #loc68 = loc("tmp1"(#loc26))
149
+ #loc69 = loc("tmp1"(#loc27))
150
+ #loc70 = loc("tmp2"(#loc28))
151
+ #loc71 = loc("tmp5"(#loc29))
152
+ #loc72 = loc("_tmp4"(#loc30))
153
+ #loc74 = loc("tmp4"(#loc35))
154
+ #loc75 = loc(callsite(#loc32 at #loc73))
155
+ #loc77 = loc(callsite(#loc34 at #loc75))
SpecForge-ext/cache/compiled_kernels/triton/6/GTRFW4ZK7225NIY3BRX6W6RC7FKGFJM7EIAGEO3CKFTP74W3CDJA/triton_red_fused_zeros_0.ttir ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25)
4
+ #loc41 = loc("in_ptr0"(#loc))
5
+ #loc42 = loc("in_ptr1"(#loc))
6
+ #loc43 = loc("out_ptr1"(#loc))
7
+ #loc44 = loc("xnumel"(#loc))
8
+ #loc45 = loc("r0_numel"(#loc))
9
+ #loc77 = loc("tmp4"(#loc35))
10
+ #loc80 = loc(callsite(#loc1 at #loc77))
11
+ module {
12
+ tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
13
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1)
14
+ %c8_i32 = arith.constant 8 : i32 loc(#loc2)
15
+ %c128_i32 = arith.constant 128 : i32 loc(#loc2)
16
+ %c0_i32 = arith.constant 0 : i32 loc(#loc2)
17
+ %cst_0 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc1)
18
+ %cst_1 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1)
19
+ %cst_2 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
20
+ %cst_3 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc1)
21
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1)
22
+ %x2 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc46)
23
+ %x1 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc47)
24
+ %cst_5 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1)
25
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
26
+ %xoffset = tt.get_program_id x : i32 loc(#loc48)
27
+ %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc49)
28
+ %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50)
29
+ %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51)
30
+ %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc52)
31
+ %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc52)
32
+ %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc53)
33
+ %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc54)
34
+ %x0 = arith.remsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc55)
35
+ %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc56)
36
+ %x1_12 = arith.remsi %x1_11, %x1 : tensor<64x1xi32> loc(#loc47)
37
+ %x2_13 = arith.divsi %xindex_9, %x2 : tensor<64x1xi32> loc(#loc46)
38
+ %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4_15 = %cst_4) -> (tensor<64x8xf32>) : i32 {
39
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc58)
40
+ %r0_index_16 = arith.addi %r0_index, %r0_base_10 : tensor<1x8xi32> loc(#loc58)
41
+ %r0_mask = arith.cmpi slt, %r0_index_16, %cst_3 : tensor<1x8xi32> loc(#loc59)
42
+ %tmp0 = arith.muli %x1_12, %cst_2 : tensor<64x1xi32> loc(#loc60)
43
+ %tmp0_17 = tt.broadcast %r0_index_16 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc61)
44
+ %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc61)
45
+ %tmp0_19 = arith.addi %tmp0_17, %tmp0_18 : tensor<64x8xi32> loc(#loc61)
46
+ %tmp0_20 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc62)
47
+ %tmp0_21 = tt.broadcast %tmp0_20 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc63)
48
+ %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<64x8xi32> loc(#loc63)
49
+ %tmp0_23 = arith.muli %x2_13, %cst_0 : tensor<64x1xi32> loc(#loc64)
50
+ %tmp0_24 = tt.broadcast %tmp0_23 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc65)
51
+ %tmp0_25 = arith.addi %tmp0_22, %tmp0_24 : tensor<64x8xi32> loc(#loc65)
52
+ %tmp0_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc66)
53
+ %tmp0_27 = tt.addptr %tmp0_26, %tmp0_25 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc66)
54
+ %tmp0_28 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc67)
55
+ %tmp0_29 = tt.load %tmp0_27, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc67)
56
+ %tmp0_30 = arith.extf %tmp0_29 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc68)
57
+ %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<64x1xi32> loc(#loc69)
58
+ %tmp1_31 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc70)
59
+ %tmp1_32 = arith.addi %tmp0_17, %tmp1_31 : tensor<64x8xi32> loc(#loc70)
60
+ %tmp1_33 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc71)
61
+ %tmp1_34 = tt.addptr %tmp1_33, %tmp1_32 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc71)
62
+ %tmp1_35 = tt.load %tmp1_34, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc72)
63
+ %tmp1_36 = arith.extf %tmp1_35 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc73)
64
+ %tmp2 = arith.mulf %tmp0_30, %tmp1_36 : tensor<64x8xf32> loc(#loc74)
65
+ %tmp5 = arith.addf %_tmp4_15, %tmp2 : tensor<64x8xf32> loc(#loc75)
66
+ %_tmp4_37 = arith.select %tmp0_28, %tmp5, %_tmp4_15 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc76)
67
+ scf.yield %_tmp4_37 : tensor<64x8xf32> loc(#loc33)
68
+ } loc(#loc57)
69
+ %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
70
+ ^bb0(%tmp4_15: f32 loc(callsite(#loc1 at #loc77)), %tmp4_16: f32 loc(callsite(#loc1 at #loc77))):
71
+ %tmp4_17 = arith.addf %tmp4_15, %tmp4_16 : f32 loc(#loc81)
72
+ tt.reduce.return %tmp4_17 : f32 loc(#loc79)
73
+ }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc79)
74
+ %tmp4_14 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc78)
75
+ %0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc38)
76
+ %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc38)
77
+ tt.store %1, %tmp4_14 : tensor<64x1x!tt.ptr<f32>> loc(#loc39)
78
+ tt.return loc(#loc40)
79
+ } loc(#loc)
80
+ } loc(#loc)
81
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40)
82
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19)
83
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29)
84
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28)
85
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33)
86
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36)
87
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44)
88
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23)
89
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27)
90
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37)
91
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19)
92
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21)
93
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31)
94
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29)
95
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45)
96
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41)
97
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55)
98
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50)
99
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68)
100
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60)
101
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34)
102
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73)
103
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127)
104
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45)
105
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41)
106
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34)
107
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50)
108
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104)
109
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22)
110
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23)
111
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40)
112
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8)
113
+ #loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
114
+ #loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
115
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28)
116
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25)
117
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36)
118
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4)
119
+ #loc46 = loc("x2"(#loc3))
120
+ #loc47 = loc("x1"(#loc4))
121
+ #loc48 = loc("xoffset"(#loc5))
122
+ #loc49 = loc("xoffset"(#loc6))
123
+ #loc50 = loc("xindex"(#loc7))
124
+ #loc51 = loc("xindex"(#loc8))
125
+ #loc52 = loc("xindex"(#loc9))
126
+ #loc53 = loc("r0_base"(#loc10))
127
+ #loc54 = loc("r0_base"(#loc11))
128
+ #loc55 = loc("x0"(#loc12))
129
+ #loc56 = loc("x1"(#loc13))
130
+ #loc57 = loc("_tmp4"(#loc2))
131
+ #loc58 = loc("r0_index"(#loc14))
132
+ #loc59 = loc("r0_mask"(#loc15))
133
+ #loc60 = loc("tmp0"(#loc16))
134
+ #loc61 = loc("tmp0"(#loc17))
135
+ #loc62 = loc("tmp0"(#loc18))
136
+ #loc63 = loc("tmp0"(#loc19))
137
+ #loc64 = loc("tmp0"(#loc20))
138
+ #loc65 = loc("tmp0"(#loc21))
139
+ #loc66 = loc("tmp0"(#loc22))
140
+ #loc67 = loc("tmp0"(#loc23))
141
+ #loc68 = loc("tmp0"(#loc24))
142
+ #loc69 = loc("tmp1"(#loc25))
143
+ #loc70 = loc("tmp1"(#loc26))
144
+ #loc71 = loc("tmp1"(#loc27))
145
+ #loc72 = loc("tmp1"(#loc28))
146
+ #loc73 = loc("tmp1"(#loc29))
147
+ #loc74 = loc("tmp2"(#loc30))
148
+ #loc75 = loc("tmp5"(#loc31))
149
+ #loc76 = loc("_tmp4"(#loc32))
150
+ #loc78 = loc("tmp4"(#loc37))
151
+ #loc79 = loc(callsite(#loc34 at #loc77))
152
+ #loc81 = loc(callsite(#loc36 at #loc79))
SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin ADDED
Binary file (27.5 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_poi_fused__to_copy_6.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source", "triton_poi_fused__to_copy_6.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir", "triton_poi_fused__to_copy_6.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir", "triton_poi_fused__to_copy_6.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir", "triton_poi_fused__to_copy_6.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx", "triton_poi_fused__to_copy_6.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin", "triton_poi_fused__to_copy_6.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json"}}
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin ADDED
Binary file (14.2 kB). View file
 
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "42bb19ed265ccbd7b1943ef4d4583a35db4f47c232332144f4da3ddc335af286", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_6"}
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ ; Function Attrs: nounwind
6
+ define ptx_kernel void @triton_poi_fused__to_copy_6(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i64 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 {
7
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
8
+ %10 = shl i32 %9, 7, !dbg !8
9
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
10
+ %12 = and i32 %11, 127, !dbg !9
11
+ %13 = or disjoint i32 %10, %12, !dbg !10
12
+ %14 = icmp slt i32 %13, %5, !dbg !11
13
+ %15 = sext i32 %13 to i64, !dbg !12
14
+ %.frozen = freeze i64 %2, !dbg !13
15
+ %16 = sdiv i64 %15, %.frozen, !dbg !13
16
+ %17 = mul i64 %16, %.frozen, !dbg !12
17
+ %.decomposed = sub i64 %15, %17, !dbg !12
18
+ %18 = srem i64 %16, %3, !dbg !14
19
+ %19 = sdiv i64 %15, %4, !dbg !15
20
+ %20 = insertelement <2 x i64> poison, i64 %3, i64 0, !dbg !16
21
+ %21 = insertelement <2 x i64> %20, i64 %2, i64 1, !dbg !16
22
+ %22 = icmp slt <2 x i64> %21, splat (i64 2), !dbg !16
23
+ %23 = icmp sgt <2 x i64> %21, splat (i64 1), !dbg !17
24
+ %24 = select <2 x i1> %23, <2 x i64> %21, <2 x i64> zeroinitializer, !dbg !18
25
+ %25 = zext <2 x i1> %22 to <2 x i64>, !dbg !19
26
+ %26 = add <2 x i64> %24, %25, !dbg !20
27
+ %27 = extractelement <2 x i64> %26, i64 0, !dbg !21
28
+ %28 = mul i64 %.decomposed, %27, !dbg !22
29
+ %29 = extractelement <2 x i64> %26, i64 1, !dbg !21
30
+ %30 = mul i64 %27, %29, !dbg !21
31
+ %31 = mul i64 %30, %19, !dbg !23
32
+ %32 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !24
33
+ %33 = getelementptr i64, ptr addrspace(1) %32, i64 %28, !dbg !24
34
+ %34 = getelementptr i64, ptr addrspace(1) %33, i64 %31, !dbg !24
35
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !25
36
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %34, i64 %35, i1 %14) #2, !dbg !25
37
+ %37 = trunc i64 %36 to i32, !dbg !26
38
+ %38 = mul i64 %18, %29, !dbg !27
39
+ %39 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !28
40
+ %40 = getelementptr i32, ptr addrspace(1) %39, i64 %38, !dbg !28
41
+ %41 = getelementptr i32, ptr addrspace(1) %40, i64 %31, !dbg !28
42
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %37, ptr addrspace(1) %41, i1 %14) #2, !dbg !29
43
+ ret void, !dbg !30
44
+ }
45
+
46
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
47
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
48
+
49
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
50
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
51
+
52
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
53
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
54
+ attributes #2 = { nounwind }
55
+
56
+ !llvm.dbg.cu = !{!0}
57
+ !llvm.module.flags = !{!2, !3}
58
+
59
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
60
+ !1 = !DIFile(filename: "c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35")
61
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
62
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
63
+ !4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_6", linkageName: "triton_poi_fused__to_copy_6", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
64
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
65
+ !6 = !{}
66
+ !7 = !DILocation(line: 19, column: 28, scope: !4)
67
+ !8 = !DILocation(line: 19, column: 33, scope: !4)
68
+ !9 = !DILocation(line: 20, column: 36, scope: !4)
69
+ !10 = !DILocation(line: 20, column: 23, scope: !4)
70
+ !11 = !DILocation(line: 21, column: 21, scope: !4)
71
+ !12 = !DILocation(line: 22, column: 19, scope: !4)
72
+ !13 = !DILocation(line: 23, column: 21, scope: !4)
73
+ !14 = !DILocation(line: 23, column: 28, scope: !4)
74
+ !15 = !DILocation(line: 24, column: 19, scope: !4)
75
+ !16 = !DILocation(line: 25, column: 54, scope: !4)
76
+ !17 = !DILocation(line: 25, column: 80, scope: !4)
77
+ !18 = !DILocation(line: 25, column: 71, scope: !4)
78
+ !19 = !DILocation(line: 25, scope: !4)
79
+ !20 = !DILocation(line: 25, column: 62, scope: !4)
80
+ !21 = !DILocation(line: 25, column: 91, scope: !4)
81
+ !22 = !DILocation(line: 25, column: 39, scope: !4)
82
+ !23 = !DILocation(line: 25, column: 138, scope: !4)
83
+ !24 = !DILocation(line: 25, column: 30, scope: !4)
84
+ !25 = !DILocation(line: 25, column: 186, scope: !4)
85
+ !26 = !DILocation(line: 26, column: 19, scope: !4)
86
+ !27 = !DILocation(line: 27, column: 34, scope: !4)
87
+ !28 = !DILocation(line: 27, column: 25, scope: !4)
88
+ !29 = !DILocation(line: 27, column: 187, scope: !4)
89
+ !30 = !DILocation(line: 27, column: 4, scope: !4)
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_poi_fused__to_copy_6 // -- Begin function triton_poi_fused__to_copy_6
10
+ // @triton_poi_fused__to_copy_6
11
+ .visible .entry triton_poi_fused__to_copy_6(
12
+ .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_0,
13
+ .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_1,
14
+ .param .u64 triton_poi_fused__to_copy_6_param_2,
15
+ .param .u64 triton_poi_fused__to_copy_6_param_3,
16
+ .param .u64 triton_poi_fused__to_copy_6_param_4,
17
+ .param .u32 triton_poi_fused__to_copy_6_param_5,
18
+ .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_6,
19
+ .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_7
20
+ )
21
+ .reqntid 128
22
+ {
23
+ .reg .pred %p<10>;
24
+ .reg .b32 %r<20>;
25
+ .reg .b64 %rd<54>;
26
+ .loc 1 18 0 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:18:0
27
+ $L__func_begin0:
28
+ .loc 1 18 0 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:18:0
29
+
30
+ // %bb.0:
31
+ ld.param.b64 %rd16, [triton_poi_fused__to_copy_6_param_3];
32
+ ld.param.b64 %rd15, [triton_poi_fused__to_copy_6_param_2];
33
+ $L__tmp0:
34
+ .loc 1 19 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:19:28
35
+ mov.u32 %r2, %ctaid.x;
36
+ .loc 1 19 33 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:19:33
37
+ shl.b32 %r3, %r2, 7;
38
+ .loc 1 20 36 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:20:36
39
+ mov.u32 %r4, %tid.x;
40
+ and.b32 %r5, %r4, 127;
41
+ .loc 1 20 23 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:20:23
42
+ or.b32 %r6, %r3, %r5;
43
+ .loc 1 22 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:22:19
44
+ cvt.s64.s32 %rd1, %r6;
45
+ .loc 1 23 21 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:23:21
46
+ or.b64 %rd19, %rd1, %rd15;
47
+ and.b64 %rd20, %rd19, -4294967296;
48
+ setp.ne.b64 %p1, %rd20, 0;
49
+ cvt.u32.u64 %r19, %rd1;
50
+ @%p1 bra $L__BB0_2;
51
+ bra.uni $L__BB0_1;
52
+ $L__BB0_2:
53
+ div.s64 %rd51, %rd1, %rd15;
54
+ bra.uni $L__BB0_3;
55
+ $L__BB0_1:
56
+ cvt.u32.u64 %r7, %rd15;
57
+ div.u32 %r9, %r19, %r7;
58
+ cvt.u64.u32 %rd51, %r9;
59
+ $L__BB0_3:
60
+ .loc 1 0 21 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:0:21
61
+ ld.param.b64 %rd17, [triton_poi_fused__to_copy_6_param_4];
62
+ .loc 1 22 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:22:19
63
+ mul.lo.s64 %rd21, %rd51, %rd15;
64
+ .loc 1 23 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:23:28
65
+ or.b64 %rd22, %rd51, %rd16;
66
+ and.b64 %rd23, %rd22, -4294967296;
67
+ setp.ne.b64 %p2, %rd23, 0;
68
+ @%p2 bra $L__BB0_5;
69
+ bra.uni $L__BB0_4;
70
+ $L__BB0_5:
71
+ rem.s64 %rd52, %rd51, %rd16;
72
+ bra.uni $L__BB0_6;
73
+ $L__BB0_4:
74
+ cvt.u32.u64 %r10, %rd16;
75
+ cvt.u32.u64 %r11, %rd51;
76
+ rem.u32 %r12, %r11, %r10;
77
+ cvt.u64.u32 %rd52, %r12;
78
+ $L__BB0_6:
79
+ .loc 1 0 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:0:28
80
+ ld.param.b32 %r1, [triton_poi_fused__to_copy_6_param_5];
81
+ ld.param.b64 %rd14, [triton_poi_fused__to_copy_6_param_1];
82
+ ld.param.b64 %rd13, [triton_poi_fused__to_copy_6_param_0];
83
+ sub.s64 %rd6, %rd1, %rd21;
84
+ .loc 1 24 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:24:19
85
+ or.b64 %rd24, %rd1, %rd17;
86
+ and.b64 %rd25, %rd24, -4294967296;
87
+ setp.ne.b64 %p3, %rd25, 0;
88
+ @%p3 bra $L__BB0_8;
89
+ bra.uni $L__BB0_7;
90
+ $L__BB0_8:
91
+ div.s64 %rd53, %rd1, %rd17;
92
+ bra.uni $L__BB0_9;
93
+ $L__BB0_7:
94
+ cvt.u32.u64 %r13, %rd17;
95
+ div.u32 %r15, %r19, %r13;
96
+ cvt.u64.u32 %rd53, %r15;
97
+ $L__BB0_9:
98
+ .loc 1 21 21 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:21:21
99
+ setp.lt.s32 %p4, %r19, %r1;
100
+ .loc 1 25 54 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:54
101
+ setp.lt.s64 %p6, %rd15, 2;
102
+ setp.lt.s64 %p7, %rd16, 2;
103
+ .loc 1 25 80 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:80
104
+ setp.gt.s64 %p8, %rd15, 1;
105
+ setp.gt.s64 %p9, %rd16, 1;
106
+ .loc 1 25 71 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:71
107
+ selp.b64 %rd31, %rd16, 0, %p9;
108
+ selp.b64 %rd32, %rd15, 0, %p8;
109
+ .loc 1 25 0 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25
110
+ selp.b64 %rd33, 1, 0, %p7;
111
+ selp.b64 %rd34, 1, 0, %p6;
112
+ .loc 1 25 62 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:62
113
+ add.s64 %rd35, %rd32, %rd34;
114
+ add.s64 %rd36, %rd31, %rd33;
115
+ .loc 1 25 39 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:39
116
+ mul.lo.s64 %rd37, %rd6, %rd36;
117
+ .loc 1 25 91 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:91
118
+ mul.lo.s64 %rd38, %rd36, %rd35;
119
+ .loc 1 25 138 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:138
120
+ mul.lo.s64 %rd39, %rd38, %rd53;
121
+ .loc 1 25 30 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:30
122
+ shl.b64 %rd40, %rd52, 3;
123
+ add.s64 %rd41, %rd13, %rd40;
124
+ shl.b64 %rd42, %rd37, 3;
125
+ add.s64 %rd43, %rd41, %rd42;
126
+ shl.b64 %rd44, %rd39, 3;
127
+ add.s64 %rd28, %rd43, %rd44;
128
+ .loc 1 25 186 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:186
129
+ // begin inline asm
130
+ mov.u64 %rd29, 0x0;
131
+ createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
132
+ // end inline asm
133
+ // begin inline asm
134
+ mov.u64 %rd27, 0x0;
135
+ @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd27 }, [ %rd28 + 0 ], %rd29;
136
+ // end inline asm
137
+ .loc 1 26 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:26:19
138
+ cvt.u32.u64 %r16, %rd27;
139
+ .loc 1 27 34 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:34
140
+ mul.lo.s64 %rd45, %rd52, %rd35;
141
+ .loc 1 27 25 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:25
142
+ shl.b64 %rd46, %rd6, 2;
143
+ add.s64 %rd47, %rd14, %rd46;
144
+ shl.b64 %rd48, %rd45, 2;
145
+ add.s64 %rd49, %rd47, %rd48;
146
+ shl.b64 %rd50, %rd39, 2;
147
+ add.s64 %rd30, %rd49, %rd50;
148
+ .loc 1 27 187 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:187
149
+ // begin inline asm
150
+ @%p4 st.global.b32 [ %rd30 + 0 ], { %r16 };
151
+ // end inline asm
152
+ .loc 1 27 4 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:4
153
+ ret;
154
+ $L__tmp1:
155
+ $L__func_end0:
156
+ // -- End function
157
+ }
158
+ .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py"
159
+ .section .debug_abbrev
160
+ {
161
+ .b8 1 // Abbreviation Code
162
+ .b8 17 // DW_TAG_compile_unit
163
+ .b8 0 // DW_CHILDREN_no
164
+ .b8 37 // DW_AT_producer
165
+ .b8 8 // DW_FORM_string
166
+ .b8 19 // DW_AT_language
167
+ .b8 5 // DW_FORM_data2
168
+ .b8 3 // DW_AT_name
169
+ .b8 8 // DW_FORM_string
170
+ .b8 16 // DW_AT_stmt_list
171
+ .b8 6 // DW_FORM_data4
172
+ .b8 27 // DW_AT_comp_dir
173
+ .b8 8 // DW_FORM_string
174
+ .b8 0 // EOM(1)
175
+ .b8 0 // EOM(2)
176
+ .b8 0 // EOM(3)
177
+ }
178
+ .section .debug_info
179
+ {
180
+ .b32 135 // Length of Unit
181
+ .b8 2 // DWARF version number
182
+ .b8 0
183
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
184
+ .b8 8 // Address Size (in bytes)
185
+ .b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit
186
+ .b8 116 // DW_AT_producer
187
+ .b8 114
188
+ .b8 105
189
+ .b8 116
190
+ .b8 111
191
+ .b8 110
192
+ .b8 0
193
+ .b8 2 // DW_AT_language
194
+ .b8 0
195
+ .b8 99 // DW_AT_name
196
+ .b8 51
197
+ .b8 53
198
+ .b8 104
199
+ .b8 117
200
+ .b8 113
201
+ .b8 112
202
+ .b8 54
203
+ .b8 110
204
+ .b8 103
205
+ .b8 122
206
+ .b8 104
207
+ .b8 54
208
+ .b8 55
209
+ .b8 107
210
+ .b8 116
211
+ .b8 51
212
+ .b8 50
213
+ .b8 107
214
+ .b8 117
215
+ .b8 120
216
+ .b8 111
217
+ .b8 113
218
+ .b8 112
219
+ .b8 103
220
+ .b8 104
221
+ .b8 99
222
+ .b8 51
223
+ .b8 50
224
+ .b8 102
225
+ .b8 115
226
+ .b8 116
227
+ .b8 118
228
+ .b8 52
229
+ .b8 122
230
+ .b8 111
231
+ .b8 103
232
+ .b8 99
233
+ .b8 111
234
+ .b8 117
235
+ .b8 122
236
+ .b8 97
237
+ .b8 98
238
+ .b8 100
239
+ .b8 120
240
+ .b8 120
241
+ .b8 119
242
+ .b8 116
243
+ .b8 97
244
+ .b8 51
245
+ .b8 115
246
+ .b8 108
247
+ .b8 46
248
+ .b8 112
249
+ .b8 121
250
+ .b8 0
251
+ .b32 .debug_line // DW_AT_stmt_list
252
+ .b8 47 // DW_AT_comp_dir
253
+ .b8 119
254
+ .b8 111
255
+ .b8 114
256
+ .b8 107
257
+ .b8 115
258
+ .b8 112
259
+ .b8 97
260
+ .b8 99
261
+ .b8 101
262
+ .b8 47
263
+ .b8 104
264
+ .b8 97
265
+ .b8 110
266
+ .b8 114
267
+ .b8 117
268
+ .b8 105
269
+ .b8 47
270
+ .b8 83
271
+ .b8 112
272
+ .b8 101
273
+ .b8 99
274
+ .b8 70
275
+ .b8 111
276
+ .b8 114
277
+ .b8 103
278
+ .b8 101
279
+ .b8 45
280
+ .b8 101
281
+ .b8 120
282
+ .b8 116
283
+ .b8 47
284
+ .b8 99
285
+ .b8 97
286
+ .b8 99
287
+ .b8 104
288
+ .b8 101
289
+ .b8 47
290
+ .b8 99
291
+ .b8 111
292
+ .b8 109
293
+ .b8 112
294
+ .b8 105
295
+ .b8 108
296
+ .b8 101
297
+ .b8 100
298
+ .b8 95
299
+ .b8 107
300
+ .b8 101
301
+ .b8 114
302
+ .b8 110
303
+ .b8 101
304
+ .b8 108
305
+ .b8 115
306
+ .b8 47
307
+ .b8 51
308
+ .b8 53
309
+ .b8 0
310
+ }
311
+ .section .debug_macinfo { }
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":18:0)
2
+ #loc56 = loc("in_ptr0"(#loc))
3
+ #loc57 = loc("out_ptr0"(#loc))
4
+ #loc58 = loc("ks0"(#loc))
5
+ #loc59 = loc("ks1"(#loc))
6
+ #loc60 = loc("ks2"(#loc))
7
+ #loc61 = loc("xnumel"(#loc))
8
+ module {
9
+ tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
10
+ %xoffset = tt.get_program_id x : i32 loc(#loc62)
11
+ %xoffset_0 = arith.constant 128 : i32 loc(#loc63)
12
+ %xoffset_1 = arith.constant 128 : i32 loc(#loc63)
13
+ %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc63)
14
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc64)
15
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc65)
16
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc65)
17
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc66)
18
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc66)
19
+ %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc67)
20
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc67)
21
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc67)
22
+ %x1 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc68)
23
+ %x1_8 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc68)
24
+ %x1_9 = arith.divsi %x1, %x1_8 : tensor<128xi64> loc(#loc68)
25
+ %x1_10 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc69)
26
+ %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<128xi64> loc(#loc69)
27
+ %x2 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc70)
28
+ %x2_12 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc70)
29
+ %x2_13 = arith.divsi %x2, %x2_12 : tensor<128xi64> loc(#loc70)
30
+ %tmp0 = arith.constant 1 : i32 loc(#loc71)
31
+ %tmp0_14 = arith.extsi %tmp0 : i32 to i64 loc(#loc71)
32
+ %tmp0_15 = arith.cmpi sge, %tmp0_14, %ks1 : i64 loc(#loc71)
33
+ %tmp0_16 = arith.constant 1 : i32 loc(#loc72)
34
+ %tmp0_17 = arith.constant 1 : i32 loc(#loc72)
35
+ %tmp0_18 = arith.extui %tmp0_15 : i1 to i32 loc(#loc72)
36
+ %tmp0_19 = arith.muli %tmp0_17, %tmp0_18 : i32 loc(#loc72)
37
+ %tmp0_20 = arith.constant 1 : i32 loc(#loc73)
38
+ %tmp0_21 = arith.extsi %tmp0_20 : i32 to i64 loc(#loc73)
39
+ %tmp0_22 = arith.cmpi sgt, %ks1, %tmp0_21 : i64 loc(#loc73)
40
+ %tmp0_23 = arith.extui %tmp0_22 : i1 to i64 loc(#loc74)
41
+ %tmp0_24 = arith.muli %ks1, %tmp0_23 : i64 loc(#loc74)
42
+ %tmp0_25 = arith.extsi %tmp0_19 : i32 to i64 loc(#loc75)
43
+ %tmp0_26 = arith.addi %tmp0_25, %tmp0_24 : i64 loc(#loc75)
44
+ %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<128xi64> loc(#loc76)
45
+ %tmp0_28 = arith.muli %x0_7, %tmp0_27 : tensor<128xi64> loc(#loc76)
46
+ %tmp0_29 = arith.addi %x1_11, %tmp0_28 : tensor<128xi64> loc(#loc77)
47
+ %tmp0_30 = arith.constant 1 : i32 loc(#loc78)
48
+ %tmp0_31 = arith.extsi %tmp0_30 : i32 to i64 loc(#loc78)
49
+ %tmp0_32 = arith.cmpi sge, %tmp0_31, %ks0 : i64 loc(#loc78)
50
+ %tmp0_33 = arith.constant 1 : i32 loc(#loc79)
51
+ %tmp0_34 = arith.constant 1 : i32 loc(#loc79)
52
+ %tmp0_35 = arith.extui %tmp0_32 : i1 to i32 loc(#loc79)
53
+ %tmp0_36 = arith.muli %tmp0_34, %tmp0_35 : i32 loc(#loc79)
54
+ %tmp0_37 = arith.constant 1 : i32 loc(#loc80)
55
+ %tmp0_38 = arith.extsi %tmp0_37 : i32 to i64 loc(#loc80)
56
+ %tmp0_39 = arith.cmpi sgt, %ks0, %tmp0_38 : i64 loc(#loc80)
57
+ %tmp0_40 = arith.extui %tmp0_39 : i1 to i64 loc(#loc81)
58
+ %tmp0_41 = arith.muli %ks0, %tmp0_40 : i64 loc(#loc81)
59
+ %tmp0_42 = arith.extsi %tmp0_36 : i32 to i64 loc(#loc82)
60
+ %tmp0_43 = arith.addi %tmp0_42, %tmp0_41 : i64 loc(#loc82)
61
+ %tmp0_44 = tt.splat %tmp0_43 : i64 -> tensor<128xi64> loc(#loc83)
62
+ %tmp0_45 = arith.muli %x2_13, %tmp0_44 : tensor<128xi64> loc(#loc83)
63
+ %tmp0_46 = arith.constant 1 : i32 loc(#loc84)
64
+ %tmp0_47 = arith.extsi %tmp0_46 : i32 to i64 loc(#loc84)
65
+ %tmp0_48 = arith.cmpi sge, %tmp0_47, %ks1 : i64 loc(#loc84)
66
+ %tmp0_49 = arith.constant 1 : i32 loc(#loc85)
67
+ %tmp0_50 = arith.constant 1 : i32 loc(#loc85)
68
+ %tmp0_51 = arith.extui %tmp0_48 : i1 to i32 loc(#loc85)
69
+ %tmp0_52 = arith.muli %tmp0_50, %tmp0_51 : i32 loc(#loc85)
70
+ %tmp0_53 = arith.constant 1 : i32 loc(#loc86)
71
+ %tmp0_54 = arith.extsi %tmp0_53 : i32 to i64 loc(#loc86)
72
+ %tmp0_55 = arith.cmpi sgt, %ks1, %tmp0_54 : i64 loc(#loc86)
73
+ %tmp0_56 = arith.extui %tmp0_55 : i1 to i64 loc(#loc87)
74
+ %tmp0_57 = arith.muli %ks1, %tmp0_56 : i64 loc(#loc87)
75
+ %tmp0_58 = arith.extsi %tmp0_52 : i32 to i64 loc(#loc88)
76
+ %tmp0_59 = arith.addi %tmp0_58, %tmp0_57 : i64 loc(#loc88)
77
+ %tmp0_60 = tt.splat %tmp0_59 : i64 -> tensor<128xi64> loc(#loc89)
78
+ %tmp0_61 = arith.muli %tmp0_45, %tmp0_60 : tensor<128xi64> loc(#loc89)
79
+ %tmp0_62 = arith.addi %tmp0_29, %tmp0_61 : tensor<128xi64> loc(#loc90)
80
+ %tmp0_63 = tt.splat %in_ptr0 : !tt.ptr<i64> -> tensor<128x!tt.ptr<i64>> loc(#loc91)
81
+ %tmp0_64 = tt.addptr %tmp0_63, %tmp0_62 : tensor<128x!tt.ptr<i64>>, tensor<128xi64> loc(#loc91)
82
+ %tmp0_65 = tt.load %tmp0_64, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<i64>> loc(#loc92)
83
+ %tmp1 = arith.trunci %tmp0_65 : tensor<128xi64> to tensor<128xi32> loc(#loc93)
84
+ %c1_i32 = arith.constant 1 : i32 loc(#loc33)
85
+ %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc33)
86
+ %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc33)
87
+ %c1_i32_66 = arith.constant 1 : i32 loc(#loc34)
88
+ %c1_i32_67 = arith.constant 1 : i32 loc(#loc34)
89
+ %2 = arith.extui %1 : i1 to i32 loc(#loc34)
90
+ %3 = arith.muli %c1_i32_67, %2 : i32 loc(#loc34)
91
+ %c1_i32_68 = arith.constant 1 : i32 loc(#loc35)
92
+ %4 = arith.extsi %c1_i32_68 : i32 to i64 loc(#loc35)
93
+ %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc35)
94
+ %6 = arith.extui %5 : i1 to i64 loc(#loc36)
95
+ %7 = arith.muli %ks0, %6 : i64 loc(#loc36)
96
+ %8 = arith.extsi %3 : i32 to i64 loc(#loc37)
97
+ %9 = arith.addi %8, %7 : i64 loc(#loc37)
98
+ %10 = tt.splat %9 : i64 -> tensor<128xi64> loc(#loc38)
99
+ %11 = arith.muli %x1_11, %10 : tensor<128xi64> loc(#loc38)
100
+ %12 = arith.addi %x0_7, %11 : tensor<128xi64> loc(#loc39)
101
+ %c1_i32_69 = arith.constant 1 : i32 loc(#loc40)
102
+ %13 = arith.extsi %c1_i32_69 : i32 to i64 loc(#loc40)
103
+ %14 = arith.cmpi sge, %13, %ks0 : i64 loc(#loc40)
104
+ %c1_i32_70 = arith.constant 1 : i32 loc(#loc41)
105
+ %c1_i32_71 = arith.constant 1 : i32 loc(#loc41)
106
+ %15 = arith.extui %14 : i1 to i32 loc(#loc41)
107
+ %16 = arith.muli %c1_i32_71, %15 : i32 loc(#loc41)
108
+ %c1_i32_72 = arith.constant 1 : i32 loc(#loc42)
109
+ %17 = arith.extsi %c1_i32_72 : i32 to i64 loc(#loc42)
110
+ %18 = arith.cmpi sgt, %ks0, %17 : i64 loc(#loc42)
111
+ %19 = arith.extui %18 : i1 to i64 loc(#loc43)
112
+ %20 = arith.muli %ks0, %19 : i64 loc(#loc43)
113
+ %21 = arith.extsi %16 : i32 to i64 loc(#loc44)
114
+ %22 = arith.addi %21, %20 : i64 loc(#loc44)
115
+ %23 = tt.splat %22 : i64 -> tensor<128xi64> loc(#loc45)
116
+ %24 = arith.muli %x2_13, %23 : tensor<128xi64> loc(#loc45)
117
+ %c1_i32_73 = arith.constant 1 : i32 loc(#loc46)
118
+ %25 = arith.extsi %c1_i32_73 : i32 to i64 loc(#loc46)
119
+ %26 = arith.cmpi sge, %25, %ks1 : i64 loc(#loc46)
120
+ %c1_i32_74 = arith.constant 1 : i32 loc(#loc47)
121
+ %c1_i32_75 = arith.constant 1 : i32 loc(#loc47)
122
+ %27 = arith.extui %26 : i1 to i32 loc(#loc47)
123
+ %28 = arith.muli %c1_i32_75, %27 : i32 loc(#loc47)
124
+ %c1_i32_76 = arith.constant 1 : i32 loc(#loc48)
125
+ %29 = arith.extsi %c1_i32_76 : i32 to i64 loc(#loc48)
126
+ %30 = arith.cmpi sgt, %ks1, %29 : i64 loc(#loc48)
127
+ %31 = arith.extui %30 : i1 to i64 loc(#loc49)
128
+ %32 = arith.muli %ks1, %31 : i64 loc(#loc49)
129
+ %33 = arith.extsi %28 : i32 to i64 loc(#loc50)
130
+ %34 = arith.addi %33, %32 : i64 loc(#loc50)
131
+ %35 = tt.splat %34 : i64 -> tensor<128xi64> loc(#loc51)
132
+ %36 = arith.muli %24, %35 : tensor<128xi64> loc(#loc51)
133
+ %37 = arith.addi %12, %36 : tensor<128xi64> loc(#loc52)
134
+ %38 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<128x!tt.ptr<i32>> loc(#loc53)
135
+ %39 = tt.addptr %38, %37 : tensor<128x!tt.ptr<i32>>, tensor<128xi64> loc(#loc53)
136
+ tt.store %39, %tmp1, %xmask_5 : tensor<128x!tt.ptr<i32>> loc(#loc54)
137
+ tt.return loc(#loc55)
138
+ } loc(#loc)
139
+ } loc(#loc)
140
+ #loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:28)
141
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:33)
142
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:36)
143
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:23)
144
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":21:21)
145
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":22:19)
146
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:21)
147
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:28)
148
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":24:19)
149
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:54)
150
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:46)
151
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:80)
152
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:71)
153
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:62)
154
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:39)
155
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:35)
156
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:106)
157
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:98)
158
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:132)
159
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:123)
160
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:114)
161
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:91)
162
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:153)
163
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:145)
164
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:179)
165
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:170)
166
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:161)
167
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:138)
168
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:87)
169
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:30)
170
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:186)
171
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":26:19)
172
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:49)
173
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:41)
174
+ #loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:75)
175
+ #loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:66)
176
+ #loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:57)
177
+ #loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:34)
178
+ #loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:30)
179
+ #loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:101)
180
+ #loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:93)
181
+ #loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:127)
182
+ #loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:118)
183
+ #loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:109)
184
+ #loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:86)
185
+ #loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:148)
186
+ #loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:140)
187
+ #loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:174)
188
+ #loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:165)
189
+ #loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:156)
190
+ #loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:133)
191
+ #loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:82)
192
+ #loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:25)
193
+ #loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:187)
194
+ #loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:4)
195
+ #loc62 = loc("xoffset"(#loc1))
196
+ #loc63 = loc("xoffset"(#loc2))
197
+ #loc64 = loc("xindex"(#loc3))
198
+ #loc65 = loc("xindex"(#loc4))
199
+ #loc66 = loc("xmask"(#loc5))
200
+ #loc67 = loc("x0"(#loc6))
201
+ #loc68 = loc("x1"(#loc7))
202
+ #loc69 = loc("x1"(#loc8))
203
+ #loc70 = loc("x2"(#loc9))
204
+ #loc71 = loc("tmp0"(#loc10))
205
+ #loc72 = loc("tmp0"(#loc11))
206
+ #loc73 = loc("tmp0"(#loc12))
207
+ #loc74 = loc("tmp0"(#loc13))
208
+ #loc75 = loc("tmp0"(#loc14))
209
+ #loc76 = loc("tmp0"(#loc15))
210
+ #loc77 = loc("tmp0"(#loc16))
211
+ #loc78 = loc("tmp0"(#loc17))
212
+ #loc79 = loc("tmp0"(#loc18))
213
+ #loc80 = loc("tmp0"(#loc19))
214
+ #loc81 = loc("tmp0"(#loc20))
215
+ #loc82 = loc("tmp0"(#loc21))
216
+ #loc83 = loc("tmp0"(#loc22))
217
+ #loc84 = loc("tmp0"(#loc23))
218
+ #loc85 = loc("tmp0"(#loc24))
219
+ #loc86 = loc("tmp0"(#loc25))
220
+ #loc87 = loc("tmp0"(#loc26))
221
+ #loc88 = loc("tmp0"(#loc27))
222
+ #loc89 = loc("tmp0"(#loc28))
223
+ #loc90 = loc("tmp0"(#loc29))
224
+ #loc91 = loc("tmp0"(#loc30))
225
+ #loc92 = loc("tmp0"(#loc31))
226
+ #loc93 = loc("tmp1"(#loc32))
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
2
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":18:0)
3
+ #loc35 = loc("in_ptr0"(#loc))
4
+ #loc36 = loc("out_ptr0"(#loc))
5
+ #loc37 = loc("ks0"(#loc))
6
+ #loc38 = loc("ks1"(#loc))
7
+ #loc39 = loc("ks2"(#loc))
8
+ #loc40 = loc("xnumel"(#loc))
9
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
10
+ tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
11
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
12
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
13
+ %xoffset = tt.get_program_id x : i32 loc(#loc41)
14
+ %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc42)
15
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc43)
16
+ %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32, #blocked> loc(#loc44)
17
+ %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32, #blocked> loc(#loc44)
18
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc45)
19
+ %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32, #blocked> loc(#loc45)
20
+ %x0 = arith.extsi %xindex_2 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc46)
21
+ %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc46)
22
+ %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc46)
23
+ %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc47)
24
+ %x1_6 = tt.splat %ks1 : i64 -> tensor<128xi64, #blocked> loc(#loc48)
25
+ %x1_7 = arith.remsi %x1, %x1_6 : tensor<128xi64, #blocked> loc(#loc48)
26
+ %x2 = tt.splat %ks2 : i64 -> tensor<128xi64, #blocked> loc(#loc49)
27
+ %x2_8 = arith.divsi %x0, %x2 : tensor<128xi64, #blocked> loc(#loc49)
28
+ %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50)
29
+ %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51)
30
+ %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52)
31
+ %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52)
32
+ %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68)
33
+ %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53)
34
+ %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<128xi64, #blocked> loc(#loc55)
35
+ %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<128xi64, #blocked> loc(#loc55)
36
+ %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<128xi64, #blocked> loc(#loc56)
37
+ %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57)
38
+ %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58)
39
+ %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59)
40
+ %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59)
41
+ %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69)
42
+ %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60)
43
+ %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<128xi64, #blocked> loc(#loc62)
44
+ %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<128xi64, #blocked> loc(#loc62)
45
+ %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<128xi64, #blocked> loc(#loc63)
46
+ %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<128xi64, #blocked> loc(#loc64)
47
+ %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr<i64> -> tensor<128x!tt.ptr<i64>, #blocked> loc(#loc65)
48
+ %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x!tt.ptr<i64>, #blocked>, tensor<128xi64, #blocked> loc(#loc65)
49
+ %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr<i64>, #blocked> loc(#loc66)
50
+ %tmp1 = arith.trunci %tmp0_29 : tensor<128xi64, #blocked> to tensor<128xi32, #blocked> loc(#loc67)
51
+ %0 = arith.muli %x1_7, %tmp0_23 : tensor<128xi64, #blocked> loc(#loc29)
52
+ %1 = arith.addi %x0_5, %0 : tensor<128xi64, #blocked> loc(#loc30)
53
+ %2 = arith.addi %1, %tmp0_25 : tensor<128xi64, #blocked> loc(#loc31)
54
+ %3 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<128x!tt.ptr<i32>, #blocked> loc(#loc32)
55
+ %4 = tt.addptr %3, %2 : tensor<128x!tt.ptr<i32>, #blocked>, tensor<128xi64, #blocked> loc(#loc32)
56
+ tt.store %4, %tmp1, %xmask_3 : tensor<128x!tt.ptr<i32>, #blocked> loc(#loc33)
57
+ tt.return loc(#loc34)
58
+ } loc(#loc)
59
+ } loc(#loc)
60
+ #loc1 = loc(unknown)
61
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:28)
62
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:33)
63
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:36)
64
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:23)
65
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":21:21)
66
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":22:19)
67
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:21)
68
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:28)
69
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":24:19)
70
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:54)
71
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:80)
72
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:71)
73
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:62)
74
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:46)
75
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:39)
76
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:35)
77
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:106)
78
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:132)
79
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:123)
80
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:114)
81
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:98)
82
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:91)
83
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:138)
84
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:87)
85
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:30)
86
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:186)
87
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":26:19)
88
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:34)
89
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:30)
90
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:82)
91
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:25)
92
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:187)
93
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:4)
94
+ #loc41 = loc("xoffset"(#loc2))
95
+ #loc42 = loc("xoffset"(#loc3))
96
+ #loc43 = loc("xindex"(#loc4))
97
+ #loc44 = loc("xindex"(#loc5))
98
+ #loc45 = loc("xmask"(#loc6))
99
+ #loc46 = loc("x0"(#loc7))
100
+ #loc47 = loc("x1"(#loc8))
101
+ #loc48 = loc("x1"(#loc9))
102
+ #loc49 = loc("x2"(#loc10))
103
+ #loc50 = loc("tmp0"(#loc11))
104
+ #loc51 = loc("tmp0"(#loc12))
105
+ #loc52 = loc("tmp0"(#loc13))
106
+ #loc53 = loc("tmp0"(#loc14))
107
+ #loc54 = loc("tmp0"(#loc15))
108
+ #loc55 = loc("tmp0"(#loc16))
109
+ #loc56 = loc("tmp0"(#loc17))
110
+ #loc57 = loc("tmp0"(#loc18))
111
+ #loc58 = loc("tmp0"(#loc19))
112
+ #loc59 = loc("tmp0"(#loc20))
113
+ #loc60 = loc("tmp0"(#loc21))
114
+ #loc61 = loc("tmp0"(#loc22))
115
+ #loc62 = loc("tmp0"(#loc23))
116
+ #loc63 = loc("tmp0"(#loc24))
117
+ #loc64 = loc("tmp0"(#loc25))
118
+ #loc65 = loc("tmp0"(#loc26))
119
+ #loc66 = loc("tmp0"(#loc27))
120
+ #loc67 = loc("tmp1"(#loc28))
121
+ #loc68 = loc(fused[#loc53, #loc54])
122
+ #loc69 = loc(fused[#loc60, #loc61])
SpecForge-ext/cache/compiled_kernels/triton/6/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":18:0)
2
+ #loc35 = loc("in_ptr0"(#loc))
3
+ #loc36 = loc("out_ptr0"(#loc))
4
+ #loc37 = loc("ks0"(#loc))
5
+ #loc38 = loc("ks1"(#loc))
6
+ #loc39 = loc("ks2"(#loc))
7
+ #loc40 = loc("xnumel"(#loc))
8
+ module {
9
+ tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr<i64> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} {
10
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
11
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
12
+ %xoffset = tt.get_program_id x : i32 loc(#loc41)
13
+ %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc42)
14
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc43)
15
+ %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc44)
16
+ %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc44)
17
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc45)
18
+ %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc45)
19
+ %x0 = arith.extsi %xindex_2 : tensor<128xi32> to tensor<128xi64> loc(#loc46)
20
+ %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc46)
21
+ %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64> loc(#loc46)
22
+ %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64> loc(#loc47)
23
+ %x1_6 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc48)
24
+ %x1_7 = arith.remsi %x1, %x1_6 : tensor<128xi64> loc(#loc48)
25
+ %x2 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc49)
26
+ %x2_8 = arith.divsi %x0, %x2 : tensor<128xi64> loc(#loc49)
27
+ %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50)
28
+ %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51)
29
+ %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52)
30
+ %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52)
31
+ %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68)
32
+ %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53)
33
+ %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<128xi64> loc(#loc55)
34
+ %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<128xi64> loc(#loc55)
35
+ %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<128xi64> loc(#loc56)
36
+ %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57)
37
+ %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58)
38
+ %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59)
39
+ %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59)
40
+ %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69)
41
+ %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60)
42
+ %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<128xi64> loc(#loc62)
43
+ %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<128xi64> loc(#loc62)
44
+ %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<128xi64> loc(#loc63)
45
+ %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<128xi64> loc(#loc64)
46
+ %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr<i64> -> tensor<128x!tt.ptr<i64>> loc(#loc65)
47
+ %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x!tt.ptr<i64>>, tensor<128xi64> loc(#loc65)
48
+ %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr<i64>> loc(#loc66)
49
+ %tmp1 = arith.trunci %tmp0_29 : tensor<128xi64> to tensor<128xi32> loc(#loc67)
50
+ %0 = arith.muli %x1_7, %tmp0_23 : tensor<128xi64> loc(#loc29)
51
+ %1 = arith.addi %x0_5, %0 : tensor<128xi64> loc(#loc30)
52
+ %2 = arith.addi %1, %tmp0_25 : tensor<128xi64> loc(#loc31)
53
+ %3 = tt.splat %out_ptr0 : !tt.ptr<i32> -> tensor<128x!tt.ptr<i32>> loc(#loc32)
54
+ %4 = tt.addptr %3, %2 : tensor<128x!tt.ptr<i32>>, tensor<128xi64> loc(#loc32)
55
+ tt.store %4, %tmp1, %xmask_3 : tensor<128x!tt.ptr<i32>> loc(#loc33)
56
+ tt.return loc(#loc34)
57
+ } loc(#loc)
58
+ } loc(#loc)
59
+ #loc1 = loc(unknown)
60
+ #loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:28)
61
+ #loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:33)
62
+ #loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:36)
63
+ #loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:23)
64
+ #loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":21:21)
65
+ #loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":22:19)
66
+ #loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:21)
67
+ #loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:28)
68
+ #loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":24:19)
69
+ #loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:54)
70
+ #loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:80)
71
+ #loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:71)
72
+ #loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:62)
73
+ #loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:46)
74
+ #loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:39)
75
+ #loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:35)
76
+ #loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:106)
77
+ #loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:132)
78
+ #loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:123)
79
+ #loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:114)
80
+ #loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:98)
81
+ #loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:91)
82
+ #loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:138)
83
+ #loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:87)
84
+ #loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:30)
85
+ #loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:186)
86
+ #loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":26:19)
87
+ #loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:34)
88
+ #loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:30)
89
+ #loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:82)
90
+ #loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:25)
91
+ #loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:187)
92
+ #loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:4)
93
+ #loc41 = loc("xoffset"(#loc2))
94
+ #loc42 = loc("xoffset"(#loc3))
95
+ #loc43 = loc("xindex"(#loc4))
96
+ #loc44 = loc("xindex"(#loc5))
97
+ #loc45 = loc("xmask"(#loc6))
98
+ #loc46 = loc("x0"(#loc7))
99
+ #loc47 = loc("x1"(#loc8))
100
+ #loc48 = loc("x1"(#loc9))
101
+ #loc49 = loc("x2"(#loc10))
102
+ #loc50 = loc("tmp0"(#loc11))
103
+ #loc51 = loc("tmp0"(#loc12))
104
+ #loc52 = loc("tmp0"(#loc13))
105
+ #loc53 = loc("tmp0"(#loc14))
106
+ #loc54 = loc("tmp0"(#loc15))
107
+ #loc55 = loc("tmp0"(#loc16))
108
+ #loc56 = loc("tmp0"(#loc17))
109
+ #loc57 = loc("tmp0"(#loc18))
110
+ #loc58 = loc("tmp0"(#loc19))
111
+ #loc59 = loc("tmp0"(#loc20))
112
+ #loc60 = loc("tmp0"(#loc21))
113
+ #loc61 = loc("tmp0"(#loc22))
114
+ #loc62 = loc("tmp0"(#loc23))
115
+ #loc63 = loc("tmp0"(#loc24))
116
+ #loc64 = loc("tmp0"(#loc25))
117
+ #loc65 = loc("tmp0"(#loc26))
118
+ #loc66 = loc("tmp0"(#loc27))
119
+ #loc67 = loc("tmp1"(#loc28))
120
+ #loc68 = loc(fused[#loc53, #loc54])
121
+ #loc69 = loc(fused[#loc60, #loc61])
SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/__grp__triton_red_fused_argmax_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused_argmax_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.source", "triton_red_fused_argmax_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttir", "triton_red_fused_argmax_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttgir", "triton_red_fused_argmax_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.llir", "triton_red_fused_argmax_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ptx", "triton_red_fused_argmax_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.cubin", "triton_red_fused_argmax_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.json"}}
SpecForge-ext/cache/compiled_kernels/triton/6/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.cubin ADDED
Binary file (21.5 kB). View file