Lekr0 commited on
Commit
0b9402c
·
verified ·
1 Parent(s): e686d7b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json +1 -0
  2. progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json +1 -0
  3. progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir +0 -0
  4. progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx +0 -0
  5. progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source +0 -0
  6. progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir +936 -0
  7. progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir +780 -0
  8. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json +1 -0
  9. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin +0 -0
  10. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json +1 -0
  11. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir +89 -0
  12. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx +357 -0
  13. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source +130 -0
  14. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir +105 -0
  15. progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir +104 -0
  16. progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json +1 -0
  17. progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json +1 -0
  18. progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir +0 -0
  19. progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx +0 -0
  20. progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source +0 -0
  21. progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir +0 -0
  22. progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir +896 -0
  23. progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/__grp__triton_tem_fused_mul_1.json +1 -0
  24. progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.json +1 -0
  25. progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.llir +0 -0
  26. progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ptx +0 -0
  27. progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.source +0 -0
  28. progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttgir +0 -0
  29. progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttir +0 -0
  30. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/__grp__triton_poi_fused_mul_1.json +1 -0
  31. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.cubin +0 -0
  32. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.json +1 -0
  33. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.llir +171 -0
  34. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ptx +491 -0
  35. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.source +130 -0
  36. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ttgir +112 -0
  37. progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ttir +104 -0
  38. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/__grp__triton_red_fused_mul_0.json +1 -0
  39. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.cubin +0 -0
  40. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.json +1 -0
  41. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.llir +203 -0
  42. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ptx +504 -0
  43. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.source +344 -0
  44. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttgir +237 -0
  45. progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttir +217 -0
  46. progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/__grp__triton_tem_fused_mul_1.json +1 -0
  47. progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.json +1 -0
  48. progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.llir +0 -0
  49. progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ptx +0 -0
  50. progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.source +0 -0
progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json"}}
progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d6d8579bc758efaefba73018088fcd8f749b5f25c3ab12d02ac240e2311ac409", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}
progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir ADDED
@@ -0,0 +1,936 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
3
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":18:0)
4
+ #loc1 = loc(unknown)
5
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":520:16)
6
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":172:41)
7
+ #loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:51)
8
+ #loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:34)
9
+ #loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":198:45)
10
+ #mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}>
11
+ #mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}>
12
+ #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
13
+ #shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
14
+ #smem = #ttg.shared_memory
15
+ #loc152 = loc("arg_Q"(#loc))
16
+ #loc153 = loc("arg_K"(#loc))
17
+ #loc154 = loc("arg_V"(#loc))
18
+ #loc155 = loc("arg_LSE"(#loc))
19
+ #loc156 = loc("arg_MAX"(#loc))
20
+ #loc157 = loc("arg_KV_NUM_BLKS"(#loc))
21
+ #loc158 = loc("arg_KV_IDX"(#loc))
22
+ #loc159 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
23
+ #loc160 = loc("arg_FULL_KV_IDX"(#loc))
24
+ #loc161 = loc("out_ptr0"(#loc))
25
+ #loc162 = loc("ks0"(#loc))
26
+ #loc163 = loc("ks1"(#loc))
27
+ #loc164 = loc("ks2"(#loc))
28
+ #loc165 = loc("ks3"(#loc))
29
+ #loc166 = loc("ks4"(#loc))
30
+ #loc200 = loc(callsite(#loc41 at #loc42))
31
+ #loc239 = loc("m_ij"(#loc83))
32
+ #loc249 = loc("l_i"(#loc95))
33
+ #loc283 = loc(callsite(#loc41 at #loc131))
34
+ #loc345 = loc(callsite(#loc239 at #loc200))
35
+ #loc355 = loc(callsite(#loc249 at #loc200))
36
+ #loc374 = loc(callsite(#loc239 at #loc283))
37
+ #loc384 = loc(callsite(#loc249 at #loc283))
38
+ #loc406 = loc(callsite(#loc1 at #loc345))
39
+ #loc408 = loc(callsite(#loc1 at #loc355))
40
+ #loc436 = loc(callsite(#loc1 at #loc374))
41
+ #loc438 = loc(callsite(#loc1 at #loc384))
42
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
43
+ tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
44
+ %cst = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1)
45
+ %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1)
46
+ %cst_1 = arith.constant dense<1> : tensor<128x1xi32, #mma> loc(#loc1)
47
+ %cst_2 = arith.constant dense<1> : tensor<1x64xi32, #mma> loc(#loc1)
48
+ %cst_3 = arith.constant dense<false> : tensor<128x64xi1, #mma> loc(#loc1)
49
+ %cst_4 = arith.constant dense<16> : tensor<1x64xi32, #mma> loc(#loc1)
50
+ %cst_5 = arith.constant dense<16> : tensor<128x1xi32, #mma> loc(#loc1)
51
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1)
52
+ %cst_7 = arith.constant dense<1024> : tensor<64x1xi32, #blocked> loc(#loc1)
53
+ %cst_8 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1)
54
+ %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
55
+ %c2_i32 = arith.constant 2 : i32 loc(#loc1)
56
+ %c4_i32 = arith.constant 4 : i32 loc(#loc1)
57
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
58
+ %c1_i32 = arith.constant 1 : i32 loc(#loc1)
59
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
60
+ %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
61
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1)
62
+ %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1)
63
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
64
+ %c63_i32 = arith.constant 63 : i32 loc(#loc1)
65
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
66
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
67
+ %cst_13 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
68
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1)
69
+ %cst_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1)
70
+ %cst_16 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1)
71
+ %cst_17 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1)
72
+ %cst_18 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
73
+ %c-1_i32 = arith.constant -1 : i32 loc(#loc1)
74
+ %c3_i32 = arith.constant 3 : i32 loc(#loc1)
75
+ %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc2)
76
+ %q_start = tt.get_program_id x : i32 loc(#loc167)
77
+ %off_zq = tt.get_program_id y : i32 loc(#loc168)
78
+ %off_hq = tt.get_program_id z : i32 loc(#loc169)
79
+ %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc170)
80
+ %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc171)
81
+ %q_offset_19 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc172)
82
+ %q_offset_20 = arith.addi %q_offset, %q_offset_19 : i32 loc(#loc173)
83
+ %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc174)
84
+ %Q = tt.addptr %arg_Q, %q_offset_20 : !tt.ptr<bf16>, i32 loc(#loc175)
85
+ %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc176)
86
+ %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc177)
87
+ %sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc178)
88
+ %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc179)
89
+ %offs_m_21 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc180)
90
+ %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc180)
91
+ %offs_m_23 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc180)
92
+ %offs_m_24 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
93
+ %offs_m_25 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
94
+ %offs_m_26 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc181)
95
+ %offs_m_27 = arith.addi %offs_m_24, %offs_m_21 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
96
+ %offs_m_28 = arith.addi %offs_m_25, %offs_m_22 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
97
+ %offs_m_29 = arith.addi %offs_m_26, %offs_m_23 : tensor<128xi32, #blocked1> loc(#loc181)
98
+ %ptr = tt.expand_dims %offs_m_27 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc297)
99
+ %ptr_30 = tt.expand_dims %offs_m_28 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc297)
100
+ %ptr_31 = arith.muli %ptr, %cst_8 : tensor<128x1xi32, #blocked> loc(#loc298)
101
+ %ptr_32 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>, #blocked> loc(#loc299)
102
+ %ptr_33 = tt.addptr %ptr_32, %ptr_31 : tensor<128x1x!tt.ptr<bf16>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc299)
103
+ %ptr_34 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc300)
104
+ %ptr_35 = tt.expand_dims %ptr_34 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc300)
105
+ %ptr_36 = tt.broadcast %ptr_33 : tensor<128x1x!tt.ptr<bf16>, #blocked> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc301)
106
+ %ptr_37 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc301)
107
+ %ptr_38 = tt.addptr %ptr_36, %ptr_37 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc301)
108
+ %q = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc302)
109
+ %q_39 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #mma> loc(#loc302)
110
+ %q_40 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32, #blocked> loc(#loc302)
111
+ %q_41 = tt.broadcast %q_40 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc303)
112
+ %q_42 = tt.load %ptr_38, %q_41, %cst_10 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc303)
113
+ %q_43 = ttg.local_alloc %q_42 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc303)
114
+ %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc188)
115
+ %kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc189)
116
+ %kv_start_44 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc190)
117
+ %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc191)
118
+ %kv_num_blocks_45 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc192)
119
+ %block_n_end = arith.muli %kv_num_blocks_45, %c2_i32 : i32 loc(#loc193)
120
+ %block_n_end_46 = arith.addi %ks1, %c63_i32 : i32 loc(#loc304)
121
+ %block_n_end_47 = arith.divsi %block_n_end_46, %c64_i32 : i32 loc(#loc305)
122
+ %block_n_end_48 = arith.maxsi %block_n_end_47, %c1_i32 : i32 loc(#loc195)
123
+ %block_n_end_49 = arith.minsi %block_n_end, %block_n_end_48 : i32 loc(#loc196)
124
+ %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc197)
125
+ %offs_n_50 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc197)
126
+ %offs_n_51 = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
127
+ %offs_n_52 = arith.addi %offs_n_51, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
128
+ %1 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc39)
129
+ %ptr_53 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc393)
130
+ %ptr_54 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc394)
131
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32, #blocked> loc(#loc395)
132
+ %m = arith.remsi %ptr_30, %q_39 : tensor<128x1xi32, #mma> loc(#loc396)
133
+ %n = tt.splat %ks1 : i32 -> tensor<1x64xi32, #mma> loc(#loc397)
134
+ %tmp3 = arith.cmpi slt, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc309)
135
+ %tmp5 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
136
+ %tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311)
137
+ %tmp7 = arith.cmpi sge, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc312)
138
+ %tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
139
+ %tmp14 = arith.remsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc314)
140
+ %tmp14_55 = arith.cmpi ne, %tmp14, %cst_0 : tensor<128x1xi32, #mma> loc(#loc315)
141
+ %tmp14_56 = arith.divsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc316)
142
+ %tmp14_57 = arith.subi %tmp14_56, %cst_1 : tensor<128x1xi32, #mma> loc(#loc317)
143
+ %tmp14_58 = arith.select %tmp14_55, %tmp14_57, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc318)
144
+ %tmp14_59 = arith.select %tmp3, %tmp14_58, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc319)
145
+ %tmp17 = tt.broadcast %tmp14_59 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
146
+ %ptr_60 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc398)
147
+ %k_61 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc399)
148
+ %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc400)
149
+ %kv_offset = arith.cmpi sgt, %block_n_end_49, %c0_i32 : i32 loc(#loc462)
150
+ %offs_n_load = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
151
+ %offs_n_load_62 = arith.addi %offs_n_load, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
152
+ %ptr_63 = tt.expand_dims %offs_n_load_62 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
153
+ %ptr_64 = arith.muli %ptr_63, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
154
+ %ptr_65 = tt.addptr %ptr_53, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
155
+ %ptr_66 = tt.broadcast %ptr_65 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
156
+ %ptr_67 = tt.addptr %ptr_66, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
157
+ %k_68 = arith.cmpi slt, %ptr_63, %k : tensor<64x1xi32, #blocked> loc(#loc395)
158
+ %k_69 = tt.broadcast %k_68 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
159
+ %k_70 = ttg.memdesc_index %k_61[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
160
+ %kv_offset_71 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
161
+ %kv_offset_72 = arith.andi %kv_offset_71, %k_69 : tensor<64x128xi1, #blocked> loc(#loc462)
162
+ %k_73 = ttg.async_copy_global_to_local %ptr_67, %k_70 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
163
+ %k_74 = ttg.async_commit_group tokens %k_73 loc(#loc399)
164
+ %ptr_75 = tt.addptr %ptr_60, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
165
+ %ptr_76 = tt.broadcast %ptr_75 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
166
+ %ptr_77 = tt.addptr %ptr_76, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
167
+ %v_78 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
168
+ %v_79 = ttg.async_copy_global_to_local %ptr_77, %v_78 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
169
+ %v_80 = ttg.async_commit_group tokens %v_79 loc(#loc400)
170
+ %kv_offset_81 = arith.cmpi sgt, %block_n_end_49, %c1_i32 : i32 loc(#loc462)
171
+ %kv_base_offset = arith.addi %kv_start_44, %c64_i32 : i32 loc(#loc324)
172
+ %offs_n_load_82 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
173
+ %offs_n_load_83 = arith.addi %offs_n_load_82, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
174
+ %ptr_84 = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
175
+ %ptr_85 = arith.muli %ptr_84, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
176
+ %ptr_86 = tt.addptr %ptr_53, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
177
+ %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
178
+ %ptr_88 = tt.addptr %ptr_87, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
179
+ %k_89 = arith.cmpi slt, %ptr_84, %k : tensor<64x1xi32, #blocked> loc(#loc395)
180
+ %k_90 = tt.broadcast %k_89 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
181
+ %k_91 = ttg.memdesc_index %k_61[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
182
+ %kv_offset_92 = tt.splat %kv_offset_81 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
183
+ %kv_offset_93 = arith.andi %kv_offset_92, %k_90 : tensor<64x128xi1, #blocked> loc(#loc462)
184
+ %k_94 = ttg.async_copy_global_to_local %ptr_88, %k_91 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
185
+ %k_95 = ttg.async_commit_group tokens %k_94 loc(#loc399)
186
+ %ptr_96 = tt.addptr %ptr_60, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
187
+ %ptr_97 = tt.broadcast %ptr_96 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
188
+ %ptr_98 = tt.addptr %ptr_97, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
189
+ %v_99 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
190
+ %v_100 = ttg.async_copy_global_to_local %ptr_98, %v_99 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
191
+ %v_101 = ttg.async_commit_group tokens %v_100 loc(#loc400)
192
+ ttng.fence_async_shared {bCluster = false} loc(#loc325)
193
+ %kv_offset_102:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_49 step %c1_i32 iter_args(%acc_174 = %cst_14, %arg17 = %cst_12, %arg18 = %cst_18, %arg19 = %c64_i32, %arg20 = %1, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_175 = %k_74, %k_176 = %k_95, %v_177 = %v_80, %v_178 = %v_101, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 {
194
+ %kv_offset_179 = arith.subi %block_n_end_49, %c2_i32 : i32 loc(#loc462)
195
+ %kv_offset_180 = arith.cmpi slt, %kv_offset_173, %kv_offset_179 : i32 loc(#loc462)
196
+ %kv_offset_181 = arith.subi %block_n_end_49, %c1_i32 : i32 loc(#loc462)
197
+ %kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc462)
198
+ %kv_offset_183 = arith.addi %arg22, %c1_i32 : i32 loc(#loc462)
199
+ %kv_offset_184 = arith.cmpi sge, %kv_offset_183, %c3_i32 : i32 loc(#loc462)
200
+ %kv_offset_185 = arith.select %kv_offset_184, %c0_i32, %kv_offset_183 : i32 loc(#loc462)
201
+ %k_186 = ttg.async_wait %k_175, %v_177 {num = 2 : i32} loc(#loc399)
202
+ %k_187 = ttg.memdesc_index %k_61[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
203
+ %k_188 = ttg.memdesc_trans %k_187 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc326)
204
+ %qk = ttng.warp_group_dot %q_43, %k_188, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc325)
205
+ %qk_189:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_188, %acc_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc325)
206
+ %qk_190 = arith.mulf %qk_189#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc327)
207
+ %n_191 = arith.remsi %arg20, %n : tensor<1x64xi32, #mma> loc(#loc397)
208
+ %post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc328)
209
+ %post_mod_scores_192 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc329)
210
+ %post_mod_scores_193 = arith.select %post_mod_scores_192, %qk_190, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc329)
211
+ %tmp5_194 = tt.broadcast %n_191 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
212
+ %tmp5_195 = arith.cmpi sle, %tmp5_194, %tmp5 : tensor<128x64xi32, #mma> loc(#loc310)
213
+ %tmp6_196 = arith.andi %tmp6, %tmp5_195 : tensor<128x64xi1, #mma> loc(#loc311)
214
+ %tmp8 = arith.cmpi slt, %n_191, %cst : tensor<1x64xi32, #mma> loc(#loc330)
215
+ %tmp9_197 = tt.broadcast %tmp8 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
216
+ %tmp9_198 = arith.andi %tmp9, %tmp9_197 : tensor<128x64xi1, #mma> loc(#loc313)
217
+ %tmp10 = arith.extui %tmp8 : tensor<1x64xi1, #mma> to tensor<1x64xi32, #mma> loc(#loc331)
218
+ %tmp10_199 = arith.cmpi eq, %tmp10, %cst : tensor<1x64xi32, #mma> loc(#loc331)
219
+ %tmp11 = tt.broadcast %tmp10_199 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc332)
220
+ %tmp11_200 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1, #mma> loc(#loc332)
221
+ %tmp16 = arith.remsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc333)
222
+ %tmp16_201 = arith.cmpi ne, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc334)
223
+ %tmp16_202 = arith.divsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc335)
224
+ %tmp16_203 = arith.subi %tmp16_202, %cst_2 : tensor<1x64xi32, #mma> loc(#loc336)
225
+ %tmp16_204 = arith.select %tmp16_201, %tmp16_203, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc337)
226
+ %tmp16_205 = arith.select %tmp8, %tmp16_204, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc338)
227
+ %tmp17_206 = tt.broadcast %tmp16_205 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
228
+ %tmp17_207 = arith.cmpi eq, %tmp17, %tmp17_206 : tensor<128x64xi32, #mma> loc(#loc320)
229
+ %tmp18 = arith.andi %tmp11_200, %tmp17_207 : tensor<128x64xi1, #mma> loc(#loc339)
230
+ %tmp19 = arith.ori %tmp9_198, %tmp18 : tensor<128x64xi1, #mma> loc(#loc340)
231
+ %tmp20 = arith.ori %tmp6_196, %tmp19 : tensor<128x64xi1, #mma> loc(#loc341)
232
+ %mask_mod_output = arith.select %post_mod_scores_192, %tmp20, %cst_3 : tensor<128x64xi1, #mma>, tensor<128x64xi1, #mma> loc(#loc342)
233
+ %post_mod_scores_208 = arith.select %mask_mod_output, %post_mod_scores_193, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc343)
234
+ %post_mod_scores_209 = arith.mulf %post_mod_scores_208, %cst_17 : tensor<128x64xf32, #mma> loc(#loc344)
235
+ %m_ij = "tt.reduce"(%post_mod_scores_209) <{axis = 1 : i32}> ({
236
+ ^bb0(%m_ij_267: f32 loc(callsite(#loc1 at #loc345)), %m_ij_268: f32 loc(callsite(#loc1 at #loc345))):
237
+ %m_ij_269 = arith.maxnumf %m_ij_267, %m_ij_268 : f32 loc(#loc457)
238
+ tt.reduce.return %m_ij_269 : f32 loc(#loc405)
239
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc405)
240
+ %m_ij_210 = arith.maxnumf %arg18, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc346)
241
+ %masked_out_rows = arith.cmpf oeq, %m_ij_210, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc347)
242
+ %m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_210 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc348)
243
+ %alpha = arith.subf %arg18, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349)
244
+ %alpha_211 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc350)
245
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc351)
246
+ %p_212 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc352)
247
+ %p_213 = arith.subf %post_mod_scores_209, %p_212 : tensor<128x64xf32, #mma> loc(#loc352)
248
+ %p_214 = math.exp2 %p_213 : tensor<128x64xf32, #mma> loc(#loc353)
249
+ %l_i_215 = arith.mulf %arg17, %alpha_211 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc354)
250
+ %l_i_216 = "tt.reduce"(%p_214) <{axis = 1 : i32}> ({
251
+ ^bb0(%l_i_267: f32 loc(callsite(#loc1 at #loc355)), %l_i_268: f32 loc(callsite(#loc1 at #loc355))):
252
+ %l_i_269 = arith.addf %l_i_267, %l_i_268 : f32 loc(#loc458)
253
+ tt.reduce.return %l_i_269 : f32 loc(#loc407)
254
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc407)
255
+ %l_i_217 = arith.addf %l_i_215, %l_i_216 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc356)
256
+ %acc_218 = tt.expand_dims %alpha_211 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc357)
257
+ %acc_219 = ttg.convert_layout %acc_218 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc358)
258
+ %acc_220 = tt.broadcast %acc_219 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc358)
259
+ %acc_221 = arith.mulf %qk_189#3, %acc_220 : tensor<128x128xf32, #mma1> loc(#loc358)
260
+ %v_222 = ttg.memdesc_index %v[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
261
+ %acc_223 = arith.truncf %p_214 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc359)
262
+ %acc_224 = ttg.convert_layout %acc_223 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc359)
263
+ %acc_225 = ttng.warp_group_dot %acc_224, %v_222, %acc_221 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc360)
264
+ %offs_n_226 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc361)
265
+ %offs_n_227 = arith.addi %arg20, %offs_n_226 : tensor<1x64xi32, #mma> loc(#loc361)
266
+ %kv_offset_228 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc462)
267
+ %cur_block_idx = arith.divsi %kv_offset_228, %c2_i32 : i32 loc(#loc409)
268
+ %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc410)
269
+ %cur_block_229 = tt.load %cur_block, %kv_offset_182 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc411)
270
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc412)
271
+ %next_block_230 = arith.cmpi slt, %next_block, %kv_num_blocks_45 : i32 loc(#loc413)
272
+ %next_block_231 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc414)
273
+ %kv_offset_232 = arith.andi %kv_offset_182, %next_block_230 : i1 loc(#loc462)
274
+ %next_block_233 = tt.load %next_block_231, %kv_offset_232 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc415)
275
+ %needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc416)
276
+ %needs_jump_234 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc417)
277
+ %needs_jump_235 = arith.cmpi eq, %needs_jump_234, %c0_i32 : i32 loc(#loc418)
278
+ %jump_to_block = arith.subi %next_block_233, %cur_block_229 : i32 loc(#loc419)
279
+ %jump_to_block_236 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc420)
280
+ %jump_to_block_237 = arith.subi %jump_to_block_236, %c64_i32 : i32 loc(#loc421)
281
+ %offset = arith.extui %needs_jump_235 : i1 to i32 loc(#loc422)
282
+ %offset_238 = arith.muli %jump_to_block_237, %offset : i32 loc(#loc422)
283
+ %offset_239 = arith.subi %c1_i32, %offset : i32 loc(#loc423)
284
+ %offset_240 = arith.muli %offset_239, %c64_i32 : i32 loc(#loc424)
285
+ %offset_241 = arith.addi %offset_238, %offset_240 : i32 loc(#loc425)
286
+ %kv_offset_242 = arith.addi %arg19, %offset_241 : i32 loc(#loc363)
287
+ %kv_offset_243 = arith.addi %arg21, %c1_i32 : i32 loc(#loc462)
288
+ %kv_offset_244 = arith.cmpi sge, %kv_offset_243, %c3_i32 : i32 loc(#loc462)
289
+ %kv_offset_245 = arith.select %kv_offset_244, %c0_i32, %kv_offset_243 : i32 loc(#loc462)
290
+ %kv_base_offset_246 = arith.addi %kv_start_44, %kv_offset_242 : i32 loc(#loc324)
291
+ %offs_n_load_247 = tt.splat %kv_base_offset_246 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
292
+ %offs_n_load_248 = arith.addi %offs_n_load_247, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
293
+ %ptr_249 = tt.expand_dims %offs_n_load_248 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
294
+ %ptr_250 = arith.muli %ptr_249, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
295
+ %ptr_251 = tt.addptr %ptr_53, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
296
+ %ptr_252 = tt.broadcast %ptr_251 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
297
+ %ptr_253 = tt.addptr %ptr_252, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
298
+ %k_254 = arith.cmpi slt, %ptr_249, %k : tensor<64x1xi32, #blocked> loc(#loc395)
299
+ %k_255 = tt.broadcast %k_254 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
300
+ %k_256 = ttg.memdesc_index %k_61[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
301
+ %kv_offset_257 = tt.splat %kv_offset_180 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
302
+ %kv_offset_258 = arith.andi %kv_offset_257, %k_255 : tensor<64x128xi1, #blocked> loc(#loc462)
303
+ %k_259 = ttg.async_copy_global_to_local %ptr_253, %k_256 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
304
+ %k_260 = ttg.async_commit_group tokens %k_259 loc(#loc399)
305
+ %ptr_261 = tt.addptr %ptr_60, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
306
+ %ptr_262 = tt.broadcast %ptr_261 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
307
+ %ptr_263 = tt.addptr %ptr_262, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
308
+ %v_264 = ttg.memdesc_index %v[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
309
+ %v_265 = ttg.async_copy_global_to_local %ptr_263, %v_264 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
310
+ %v_266 = ttg.async_commit_group tokens %v_265 loc(#loc400)
311
+ scf.yield %acc_225, %l_i_217, %m_ij_210, %kv_offset_242, %offs_n_227, %kv_offset_245, %kv_offset_185, %k_176, %k_260, %v_178, %v_266, %offset_241 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc462)
312
+ } loc(#loc462)
313
+ %kv_offset_103 = ttng.warp_group_dot_wait %kv_offset_102#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc462)
314
+ %kv_offset_104 = ttg.async_wait {num = 0 : i32} loc(#loc462)
315
+ ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
316
+ ttg.local_dealloc %k_61 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
317
+ %kv_indices_105 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc275)
318
+ %kv_start_106 = tt.load %kv_indices_105 : !tt.ptr<i32> loc(#loc276)
319
+ %kv_start_107 = arith.muli %kv_start_106, %c128_i32 : i32 loc(#loc277)
320
+ %kv_num_blocks_108 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc278)
321
+ %kv_num_blocks_109 = tt.load %kv_num_blocks_108 : !tt.ptr<i32> loc(#loc279)
322
+ %block_n_end_110 = arith.muli %kv_num_blocks_109, %c2_i32 : i32 loc(#loc280)
323
+ %block_n_end_111 = arith.minsi %block_n_end_110, %block_n_end_48 : i32 loc(#loc281)
324
+ %offs_n_112 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
325
+ %offs_n_113 = arith.addi %offs_n_112, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
326
+ %2 = tt.expand_dims %offs_n_113 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc130)
327
+ %k_114 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc426)
328
+ %v_115 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc427)
329
+ %kv_offset_116 = arith.cmpi sgt, %block_n_end_111, %c0_i32 : i32 loc(#loc463)
330
+ %offs_n_load_117 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
331
+ %offs_n_load_118 = arith.addi %offs_n_load_117, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
332
+ %ptr_119 = tt.expand_dims %offs_n_load_118 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
333
+ %ptr_120 = arith.muli %ptr_119, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
334
+ %ptr_121 = tt.addptr %ptr_53, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
335
+ %ptr_122 = tt.broadcast %ptr_121 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
336
+ %ptr_123 = tt.addptr %ptr_122, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
337
+ %k_124 = arith.cmpi slt, %ptr_119, %k : tensor<64x1xi32, #blocked> loc(#loc432)
338
+ %k_125 = tt.broadcast %k_124 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
339
+ %k_126 = ttg.memdesc_index %k_114[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
340
+ %kv_offset_127 = tt.splat %kv_offset_116 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
341
+ %kv_offset_128 = arith.andi %kv_offset_127, %k_125 : tensor<64x128xi1, #blocked> loc(#loc463)
342
+ %k_129 = ttg.async_copy_global_to_local %ptr_123, %k_126 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
343
+ %k_130 = ttg.async_commit_group tokens %k_129 loc(#loc426)
344
+ %ptr_131 = tt.addptr %ptr_60, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
345
+ %ptr_132 = tt.broadcast %ptr_131 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
346
+ %ptr_133 = tt.addptr %ptr_132, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
347
+ %v_134 = ttg.memdesc_index %v_115[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
348
+ %v_135 = ttg.async_copy_global_to_local %ptr_133, %v_134 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
349
+ %v_136 = ttg.async_commit_group tokens %v_135 loc(#loc427)
350
+ %kv_offset_137 = arith.cmpi sgt, %block_n_end_111, %c1_i32 : i32 loc(#loc463)
351
+ %kv_base_offset_138 = arith.addi %kv_start_107, %c64_i32 : i32 loc(#loc367)
352
+ %offs_n_load_139 = tt.splat %kv_base_offset_138 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
353
+ %offs_n_load_140 = arith.addi %offs_n_load_139, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
354
+ %ptr_141 = tt.expand_dims %offs_n_load_140 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
355
+ %ptr_142 = arith.muli %ptr_141, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
356
+ %ptr_143 = tt.addptr %ptr_53, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
357
+ %ptr_144 = tt.broadcast %ptr_143 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
358
+ %ptr_145 = tt.addptr %ptr_144, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
359
+ %k_146 = arith.cmpi slt, %ptr_141, %k : tensor<64x1xi32, #blocked> loc(#loc432)
360
+ %k_147 = tt.broadcast %k_146 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
361
+ %k_148 = ttg.memdesc_index %k_114[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
362
+ %kv_offset_149 = tt.splat %kv_offset_137 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
363
+ %kv_offset_150 = arith.andi %kv_offset_149, %k_147 : tensor<64x128xi1, #blocked> loc(#loc463)
364
+ %k_151 = ttg.async_copy_global_to_local %ptr_145, %k_148 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
365
+ %k_152 = ttg.async_commit_group tokens %k_151 loc(#loc426)
366
+ %ptr_153 = tt.addptr %ptr_60, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
367
+ %ptr_154 = tt.broadcast %ptr_153 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
368
+ %ptr_155 = tt.addptr %ptr_154, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
369
+ %v_156 = ttg.memdesc_index %v_115[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
370
+ %v_157 = ttg.async_copy_global_to_local %ptr_155, %v_156 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
371
+ %v_158 = ttg.async_commit_group tokens %v_157 loc(#loc427)
372
+ ttng.fence_async_shared {bCluster = false} loc(#loc368)
373
+ %kv_offset_159:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_111 step %c1_i32 iter_args(%kv_offset_174 = %kv_offset_103, %kv_offset_175 = %kv_offset_102#1, %kv_offset_176 = %kv_offset_102#2, %arg19 = %c64_i32, %arg20 = %2, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_177 = %k_130, %k_178 = %k_152, %v_179 = %v_136, %v_180 = %v_158, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 {
374
+ %kv_offset_181 = arith.subi %block_n_end_111, %c2_i32 : i32 loc(#loc463)
375
+ %kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc463)
376
+ %kv_offset_183 = arith.subi %block_n_end_111, %c1_i32 : i32 loc(#loc463)
377
+ %kv_offset_184 = arith.cmpi slt, %kv_offset_173, %kv_offset_183 : i32 loc(#loc463)
378
+ %kv_offset_185 = arith.addi %arg22, %c1_i32 : i32 loc(#loc463)
379
+ %kv_offset_186 = arith.cmpi sge, %kv_offset_185, %c3_i32 : i32 loc(#loc463)
380
+ %kv_offset_187 = arith.select %kv_offset_186, %c0_i32, %kv_offset_185 : i32 loc(#loc463)
381
+ %k_188 = ttg.async_wait %k_177, %v_179 {num = 2 : i32} loc(#loc426)
382
+ %k_189 = ttg.memdesc_index %k_114[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
383
+ %k_190 = ttg.memdesc_trans %k_189 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc369)
384
+ %qk = ttng.warp_group_dot %q_43, %k_190, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc368)
385
+ %qk_191:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_190, %kv_offset_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc368)
386
+ %qk_192 = arith.mulf %qk_191#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc370)
387
+ %post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc371)
388
+ %post_mod_scores_193 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc372)
389
+ %post_mod_scores_194 = arith.select %post_mod_scores_193, %qk_192, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc372)
390
+ %post_mod_scores_195 = arith.mulf %post_mod_scores_194, %cst_17 : tensor<128x64xf32, #mma> loc(#loc373)
391
+ %m_ij = "tt.reduce"(%post_mod_scores_195) <{axis = 1 : i32}> ({
392
+ ^bb0(%m_ij_253: f32 loc(callsite(#loc1 at #loc374)), %m_ij_254: f32 loc(callsite(#loc1 at #loc374))):
393
+ %m_ij_255 = arith.maxnumf %m_ij_253, %m_ij_254 : f32 loc(#loc459)
394
+ tt.reduce.return %m_ij_255 : f32 loc(#loc435)
395
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc435)
396
+ %m_ij_196 = arith.maxnumf %kv_offset_176, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc375)
397
+ %masked_out_rows = arith.cmpf oeq, %m_ij_196, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc376)
398
+ %m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_196 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc377)
399
+ %alpha = arith.subf %kv_offset_176, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378)
400
+ %alpha_197 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc379)
401
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc380)
402
+ %p_198 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc381)
403
+ %p_199 = arith.subf %post_mod_scores_195, %p_198 : tensor<128x64xf32, #mma> loc(#loc381)
404
+ %p_200 = math.exp2 %p_199 : tensor<128x64xf32, #mma> loc(#loc382)
405
+ %l_i_201 = arith.mulf %kv_offset_175, %alpha_197 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc383)
406
+ %l_i_202 = "tt.reduce"(%p_200) <{axis = 1 : i32}> ({
407
+ ^bb0(%l_i_253: f32 loc(callsite(#loc1 at #loc384)), %l_i_254: f32 loc(callsite(#loc1 at #loc384))):
408
+ %l_i_255 = arith.addf %l_i_253, %l_i_254 : f32 loc(#loc460)
409
+ tt.reduce.return %l_i_255 : f32 loc(#loc437)
410
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc437)
411
+ %l_i_203 = arith.addf %l_i_201, %l_i_202 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc385)
412
+ %acc_204 = tt.expand_dims %alpha_197 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc386)
413
+ %acc_205 = ttg.convert_layout %acc_204 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc387)
414
+ %acc_206 = tt.broadcast %acc_205 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc387)
415
+ %acc_207 = arith.mulf %qk_191#3, %acc_206 : tensor<128x128xf32, #mma1> loc(#loc387)
416
+ %v_208 = ttg.memdesc_index %v_115[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
417
+ %acc_209 = arith.truncf %p_200 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc388)
418
+ %acc_210 = ttg.convert_layout %acc_209 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc388)
419
+ %acc_211 = ttng.warp_group_dot %acc_210, %v_208, %acc_207 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc389)
420
+ %offs_n_212 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc390)
421
+ %offs_n_213 = arith.addi %arg20, %offs_n_212 : tensor<1x64xi32, #mma> loc(#loc390)
422
+ %kv_offset_214 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc463)
423
+ %cur_block_idx = arith.divsi %kv_offset_214, %c2_i32 : i32 loc(#loc439)
424
+ %cur_block = tt.addptr %kv_indices_105, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc440)
425
+ %cur_block_215 = tt.load %cur_block, %kv_offset_184 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc441)
426
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc442)
427
+ %next_block_216 = arith.cmpi slt, %next_block, %kv_num_blocks_109 : i32 loc(#loc443)
428
+ %next_block_217 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc444)
429
+ %kv_offset_218 = arith.andi %kv_offset_184, %next_block_216 : i1 loc(#loc463)
430
+ %next_block_219 = tt.load %next_block_217, %kv_offset_218 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc445)
431
+ %needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc446)
432
+ %needs_jump_220 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc447)
433
+ %needs_jump_221 = arith.cmpi eq, %needs_jump_220, %c0_i32 : i32 loc(#loc448)
434
+ %jump_to_block = arith.subi %next_block_219, %cur_block_215 : i32 loc(#loc449)
435
+ %jump_to_block_222 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc450)
436
+ %jump_to_block_223 = arith.subi %jump_to_block_222, %c64_i32 : i32 loc(#loc451)
437
+ %offset = arith.extui %needs_jump_221 : i1 to i32 loc(#loc452)
438
+ %offset_224 = arith.muli %jump_to_block_223, %offset : i32 loc(#loc452)
439
+ %offset_225 = arith.subi %c1_i32, %offset : i32 loc(#loc453)
440
+ %offset_226 = arith.muli %offset_225, %c64_i32 : i32 loc(#loc454)
441
+ %offset_227 = arith.addi %offset_224, %offset_226 : i32 loc(#loc455)
442
+ %kv_offset_228 = arith.addi %arg19, %offset_227 : i32 loc(#loc392)
443
+ %kv_offset_229 = arith.addi %arg21, %c1_i32 : i32 loc(#loc463)
444
+ %kv_offset_230 = arith.cmpi sge, %kv_offset_229, %c3_i32 : i32 loc(#loc463)
445
+ %kv_offset_231 = arith.select %kv_offset_230, %c0_i32, %kv_offset_229 : i32 loc(#loc463)
446
+ %kv_base_offset_232 = arith.addi %kv_start_107, %kv_offset_228 : i32 loc(#loc367)
447
+ %offs_n_load_233 = tt.splat %kv_base_offset_232 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
448
+ %offs_n_load_234 = arith.addi %offs_n_load_233, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
449
+ %ptr_235 = tt.expand_dims %offs_n_load_234 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
450
+ %ptr_236 = arith.muli %ptr_235, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
451
+ %ptr_237 = tt.addptr %ptr_53, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
452
+ %ptr_238 = tt.broadcast %ptr_237 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
453
+ %ptr_239 = tt.addptr %ptr_238, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
454
+ %k_240 = arith.cmpi slt, %ptr_235, %k : tensor<64x1xi32, #blocked> loc(#loc432)
455
+ %k_241 = tt.broadcast %k_240 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
456
+ %k_242 = ttg.memdesc_index %k_114[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
457
+ %kv_offset_243 = tt.splat %kv_offset_182 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
458
+ %kv_offset_244 = arith.andi %kv_offset_243, %k_241 : tensor<64x128xi1, #blocked> loc(#loc463)
459
+ %k_245 = ttg.async_copy_global_to_local %ptr_239, %k_242 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
460
+ %k_246 = ttg.async_commit_group tokens %k_245 loc(#loc426)
461
+ %ptr_247 = tt.addptr %ptr_60, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
462
+ %ptr_248 = tt.broadcast %ptr_247 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
463
+ %ptr_249 = tt.addptr %ptr_248, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
464
+ %v_250 = ttg.memdesc_index %v_115[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
465
+ %v_251 = ttg.async_copy_global_to_local %ptr_249, %v_250 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
466
+ %v_252 = ttg.async_commit_group tokens %v_251 loc(#loc427)
467
+ scf.yield %acc_211, %l_i_203, %m_ij_196, %kv_offset_228, %offs_n_213, %kv_offset_231, %kv_offset_187, %k_178, %k_246, %v_180, %v_252, %offset_227 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc463)
468
+ } loc(#loc463)
469
+ %kv_offset_160 = ttng.warp_group_dot_wait %kv_offset_159#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc463)
470
+ %kv_offset_161 = ttg.async_wait {num = 0 : i32} loc(#loc463)
471
+ ttg.local_dealloc %v_115 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
472
+ ttg.local_dealloc %k_114 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
473
+ %l_i = arith.cmpf oeq, %kv_offset_159#1, %cst_12 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc284)
474
+ %l_i_162 = arith.select %l_i, %cst_13, %kv_offset_159#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc285)
475
+ %acc = tt.expand_dims %l_i_162 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc286)
476
+ %acc_163 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc287)
477
+ %acc_164 = tt.broadcast %acc_163 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc287)
478
+ %acc_165 = arith.divf %kv_offset_160, %acc_164 : tensor<128x128xf32, #mma1> loc(#loc287)
479
+ %mask = arith.cmpi slt, %ptr_35, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc288)
480
+ %mask_166 = tt.broadcast %mask : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289)
481
+ %mask_167 = arith.andi %q_41, %mask_166 : tensor<128x128xi1, #blocked> loc(#loc289)
482
+ %3 = tt.splat %q_offset_19 : i32 -> tensor<1x128xi32, #blocked> loc(#loc138)
483
+ %4 = arith.addi %ptr_35, %3 : tensor<1x128xi32, #blocked> loc(#loc138)
484
+ %5 = tt.broadcast %4 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
485
+ %6 = tt.broadcast %ptr_31 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
486
+ %7 = arith.addi %5, %6 : tensor<128x128xi32, #blocked> loc(#loc139)
487
+ %8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc140)
488
+ %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc140)
489
+ %10 = arith.truncf %acc_165 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc141)
490
+ %11 = ttg.convert_layout %10 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc141)
491
+ tt.store %9, %11, %mask_167 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc141)
492
+ %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc290)
493
+ %off_hz_168 = arith.addi %off_hz, %off_hq : i32 loc(#loc291)
494
+ %l_ptrs = arith.muli %off_hz_168, %ks0 : i32 loc(#loc292)
495
+ %l_ptrs_169 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc293)
496
+ %l_ptrs_170 = tt.splat %l_ptrs_169 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc294)
497
+ %l_ptrs_171 = tt.addptr %l_ptrs_170, %offs_m_29 : tensor<128x!tt.ptr<f32>, #blocked1>, tensor<128xi32, #blocked1> loc(#loc294)
498
+ %lse = math.log2 %l_i_162 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc295)
499
+ %lse_172 = arith.addf %kv_offset_159#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc296)
500
+ %12 = tt.splat %ks0 : i32 -> tensor<128xi32, #blocked1> loc(#loc149)
501
+ %13 = arith.cmpi slt, %offs_m_29, %12 : tensor<128xi32, #blocked1> loc(#loc149)
502
+ %14 = ttg.convert_layout %lse_172 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc150)
503
+ tt.store %l_ptrs_171, %14, %13 : tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc150)
504
+ tt.return loc(#loc151)
505
+ } loc(#loc)
506
+ } loc(#loc)
507
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":85:54)
508
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":97:28)
509
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":98:27)
510
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":99:27)
511
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":104:24)
512
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:24)
513
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:45)
514
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:36)
515
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":108:47)
516
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":111:12)
517
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":112:12)
518
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":113:12)
519
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":143:97)
520
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:23)
521
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:46)
522
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:33)
523
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:27)
524
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":146:101)
525
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:38)
526
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:20)
527
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:56)
528
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:49)
529
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:52)
530
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:23)
531
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":151:26)
532
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:23)
533
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:37)
534
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:42)
535
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:28)
536
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:45)
537
+ #loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
538
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:92)
539
+ #loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
540
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:102)
541
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:65)
542
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:37)
543
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:24)
544
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":167:48)
545
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":347:107)
546
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":257:21)
547
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":358:36)
548
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":359:36)
549
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":372:22)
550
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":374:23)
551
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":375:22)
552
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":376:23)
553
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":378:22)
554
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:70)
555
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:79)
556
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:91)
557
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:99)
558
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:102)
559
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:119)
560
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":386:25)
561
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":421:107)
562
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":484:40)
563
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":346:35)
564
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":342:32)
565
+ #loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":351:19)
566
+ #loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":349:17)
567
+ #loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":353:14)
568
+ #loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:44)
569
+ #loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:69)
570
+ #loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":377:22)
571
+ #loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":379:24)
572
+ #loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":380:23)
573
+ #loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:70)
574
+ #loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:79)
575
+ #loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:91)
576
+ #loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:99)
577
+ #loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:102)
578
+ #loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:119)
579
+ #loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":387:24)
580
+ #loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":388:23)
581
+ #loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":389:23)
582
+ #loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":394:73)
583
+ #loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":396:69)
584
+ #loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":399:27)
585
+ #loc82 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
586
+ #loc84 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
587
+ #loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:27)
588
+ #loc86 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":405:35)
589
+ #loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":406:51)
590
+ #loc88 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:31)
591
+ #loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:25)
592
+ #loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:51)
593
+ #loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:39)
594
+ #loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:21)
595
+ #loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:16)
596
+ #loc94 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
597
+ #loc96 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
598
+ #loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:24)
599
+ #loc98 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:22)
600
+ #loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:16)
601
+ #loc100 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:22)
602
+ #loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:44)
603
+ #loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":530:26)
604
+ #loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":247:33)
605
+ #loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":527:63)
606
+ #loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:38)
607
+ #loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:24)
608
+ #loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:109)
609
+ #loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:113)
610
+ #loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:55)
611
+ #loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:25)
612
+ #loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:30)
613
+ #loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:35)
614
+ #loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:60)
615
+ #loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:34)
616
+ #loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:48)
617
+ #loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:63)
618
+ #loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:29)
619
+ #loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:47)
620
+ #loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:61)
621
+ #loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:42)
622
+ #loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":531:21)
623
+ #loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":181:35)
624
+ #loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:27)
625
+ #loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:41)
626
+ #loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:51)
627
+ #loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:32)
628
+ #loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:49)
629
+ #loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:69)
630
+ #loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":186:28)
631
+ #loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":193:52)
632
+ #loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:26)
633
+ #loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:34)
634
+ #loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:20)
635
+ #loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:16)
636
+ #loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:38)
637
+ #loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:30)
638
+ #loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:49)
639
+ #loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:62)
640
+ #loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:25)
641
+ #loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:92)
642
+ #loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:26)
643
+ #loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:31)
644
+ #loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:32)
645
+ #loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:23)
646
+ #loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:40)
647
+ #loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:33)
648
+ #loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:20)
649
+ #loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:48)
650
+ #loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:29)
651
+ #loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":229:4)
652
+ #loc167 = loc("q_start"(#loc3))
653
+ #loc168 = loc("off_zq"(#loc4))
654
+ #loc169 = loc("off_hq"(#loc5))
655
+ #loc170 = loc("off_hkv"(#loc6))
656
+ #loc171 = loc("q_offset"(#loc7))
657
+ #loc172 = loc("q_offset"(#loc8))
658
+ #loc173 = loc("q_offset"(#loc9))
659
+ #loc174 = loc("k_offset"(#loc10))
660
+ #loc175 = loc("Q"(#loc11))
661
+ #loc176 = loc("K"(#loc12))
662
+ #loc177 = loc("V"(#loc13))
663
+ #loc178 = loc("sparse_kv_idx_offset"(#loc14))
664
+ #loc179 = loc("offs_m"(#loc15))
665
+ #loc180 = loc("offs_m"(#loc16))
666
+ #loc181 = loc("offs_m"(#loc17))
667
+ #loc182 = loc("ptr"(#loc18))
668
+ #loc183 = loc("q"(#loc19))
669
+ #loc184 = loc("ptr"(#loc20))
670
+ #loc185 = loc("ptr"(#loc21))
671
+ #loc186 = loc("ptr"(#loc22))
672
+ #loc187 = loc("ptr"(#loc23))
673
+ #loc188 = loc("kv_indices"(#loc26))
674
+ #loc189 = loc("kv_start"(#loc27))
675
+ #loc190 = loc("kv_start"(#loc28))
676
+ #loc191 = loc("kv_num_blocks"(#loc29))
677
+ #loc192 = loc("kv_num_blocks"(#loc30))
678
+ #loc193 = loc("block_n_end"(#loc31))
679
+ #loc194 = loc("block_n_end"(#loc33))
680
+ #loc195 = loc("block_n_end"(#loc35))
681
+ #loc196 = loc("block_n_end"(#loc36))
682
+ #loc197 = loc("offs_n"(#loc37))
683
+ #loc198 = loc("offs_n"(#loc38))
684
+ #loc199 = loc("k"(#loc40))
685
+ #loc201 = loc("m"(#loc44))
686
+ #loc202 = loc("n"(#loc45))
687
+ #loc203 = loc("tmp3"(#loc46))
688
+ #loc204 = loc("tmp5"(#loc47))
689
+ #loc205 = loc("tmp6"(#loc48))
690
+ #loc206 = loc("tmp7"(#loc49))
691
+ #loc207 = loc("tmp9"(#loc50))
692
+ #loc208 = loc("tmp14"(#loc51))
693
+ #loc209 = loc("tmp14"(#loc52))
694
+ #loc210 = loc("tmp14"(#loc53))
695
+ #loc211 = loc("tmp14"(#loc54))
696
+ #loc212 = loc("tmp14"(#loc55))
697
+ #loc213 = loc("tmp14"(#loc56))
698
+ #loc214 = loc("tmp17"(#loc57))
699
+ #loc215 = loc("v"(#loc58))
700
+ #loc216 = loc("acc"(#loc59))
701
+ #loc217 = loc("offs_n_load"(#loc60))
702
+ #loc218 = loc("kv_base_offset"(#loc61))
703
+ #loc219 = loc("qk"(#loc62))
704
+ #loc220 = loc("k"(#loc63))
705
+ #loc221 = loc("qk"(#loc64))
706
+ #loc222 = loc("post_mod_scores"(#loc65))
707
+ #loc223 = loc("post_mod_scores"(#loc66))
708
+ #loc224 = loc("tmp8"(#loc67))
709
+ #loc225 = loc("tmp10"(#loc68))
710
+ #loc226 = loc("tmp11"(#loc69))
711
+ #loc227 = loc("tmp16"(#loc70))
712
+ #loc228 = loc("tmp16"(#loc71))
713
+ #loc229 = loc("tmp16"(#loc72))
714
+ #loc230 = loc("tmp16"(#loc73))
715
+ #loc231 = loc("tmp16"(#loc74))
716
+ #loc232 = loc("tmp16"(#loc75))
717
+ #loc233 = loc("tmp18"(#loc76))
718
+ #loc234 = loc("tmp19"(#loc77))
719
+ #loc235 = loc("tmp20"(#loc78))
720
+ #loc236 = loc("mask_mod_output"(#loc79))
721
+ #loc237 = loc("post_mod_scores"(#loc80))
722
+ #loc238 = loc("post_mod_scores"(#loc81))
723
+ #loc240 = loc("m_ij"(#loc85))
724
+ #loc241 = loc("masked_out_rows"(#loc86))
725
+ #loc242 = loc("m_ij_masked"(#loc87))
726
+ #loc243 = loc("alpha"(#loc88))
727
+ #loc244 = loc("alpha"(#loc89))
728
+ #loc245 = loc("p"(#loc90))
729
+ #loc246 = loc("p"(#loc91))
730
+ #loc247 = loc("p"(#loc92))
731
+ #loc248 = loc("l_i"(#loc93))
732
+ #loc250 = loc("l_i"(#loc97))
733
+ #loc251 = loc("acc"(#loc98))
734
+ #loc252 = loc("acc"(#loc99))
735
+ #loc253 = loc("acc"(#loc100))
736
+ #loc254 = loc("acc"(#loc101))
737
+ #loc255 = loc("offs_n"(#loc102))
738
+ #loc256 = loc("cur_block_idx"(#loc103))
739
+ #loc257 = loc("offset"(#loc104))
740
+ #loc258 = loc("cur_block"(#loc105))
741
+ #loc259 = loc("cur_block"(#loc106))
742
+ #loc260 = loc("next_block"(#loc107))
743
+ #loc261 = loc("next_block"(#loc108))
744
+ #loc262 = loc("next_block"(#loc109))
745
+ #loc263 = loc("next_block"(#loc110))
746
+ #loc264 = loc("needs_jump"(#loc111))
747
+ #loc265 = loc("needs_jump"(#loc112))
748
+ #loc266 = loc("needs_jump"(#loc113))
749
+ #loc267 = loc("jump_to_block"(#loc114))
750
+ #loc268 = loc("jump_to_block"(#loc115))
751
+ #loc269 = loc("jump_to_block"(#loc116))
752
+ #loc270 = loc("offset"(#loc117))
753
+ #loc271 = loc("offset"(#loc118))
754
+ #loc272 = loc("offset"(#loc119))
755
+ #loc273 = loc("offset"(#loc120))
756
+ #loc274 = loc("kv_offset"(#loc121))
757
+ #loc275 = loc("kv_indices"(#loc122))
758
+ #loc276 = loc("kv_start"(#loc123))
759
+ #loc277 = loc("kv_start"(#loc124))
760
+ #loc278 = loc("kv_num_blocks"(#loc125))
761
+ #loc279 = loc("kv_num_blocks"(#loc126))
762
+ #loc280 = loc("block_n_end"(#loc127))
763
+ #loc281 = loc("block_n_end"(#loc128))
764
+ #loc282 = loc("offs_n"(#loc129))
765
+ #loc284 = loc("l_i"(#loc132))
766
+ #loc285 = loc("l_i"(#loc133))
767
+ #loc286 = loc("acc"(#loc134))
768
+ #loc287 = loc("acc"(#loc135))
769
+ #loc288 = loc("mask"(#loc136))
770
+ #loc289 = loc("mask"(#loc137))
771
+ #loc290 = loc("off_hz"(#loc142))
772
+ #loc291 = loc("off_hz"(#loc143))
773
+ #loc292 = loc("l_ptrs"(#loc144))
774
+ #loc293 = loc("l_ptrs"(#loc145))
775
+ #loc294 = loc("l_ptrs"(#loc146))
776
+ #loc295 = loc("lse"(#loc147))
777
+ #loc296 = loc("lse"(#loc148))
778
+ #loc297 = loc(callsite(#loc182 at #loc183))
779
+ #loc298 = loc(callsite(#loc184 at #loc183))
780
+ #loc299 = loc(callsite(#loc185 at #loc183))
781
+ #loc300 = loc(callsite(#loc186 at #loc183))
782
+ #loc301 = loc(callsite(#loc187 at #loc183))
783
+ #loc302 = loc(callsite(#loc24 at #loc183))
784
+ #loc303 = loc(callsite(#loc25 at #loc183))
785
+ #loc304 = loc(callsite(#loc32 at #loc194))
786
+ #loc305 = loc(callsite(#loc34 at #loc194))
787
+ #loc306 = loc(callsite(#loc199 at #loc200))
788
+ #loc307 = loc(callsite(#loc201 at #loc200))
789
+ #loc308 = loc(callsite(#loc202 at #loc200))
790
+ #loc309 = loc(callsite(#loc203 at #loc200))
791
+ #loc310 = loc(callsite(#loc204 at #loc200))
792
+ #loc311 = loc(callsite(#loc205 at #loc200))
793
+ #loc312 = loc(callsite(#loc206 at #loc200))
794
+ #loc313 = loc(callsite(#loc207 at #loc200))
795
+ #loc314 = loc(callsite(#loc208 at #loc200))
796
+ #loc315 = loc(callsite(#loc209 at #loc200))
797
+ #loc316 = loc(callsite(#loc210 at #loc200))
798
+ #loc317 = loc(callsite(#loc211 at #loc200))
799
+ #loc318 = loc(callsite(#loc212 at #loc200))
800
+ #loc319 = loc(callsite(#loc213 at #loc200))
801
+ #loc320 = loc(callsite(#loc214 at #loc200))
802
+ #loc321 = loc(callsite(#loc215 at #loc200))
803
+ #loc322 = loc("l_i"(#loc216))
804
+ #loc323 = loc(callsite(#loc217 at #loc200))
805
+ #loc324 = loc(callsite(#loc218 at #loc200))
806
+ #loc325 = loc(callsite(#loc219 at #loc200))
807
+ #loc326 = loc(callsite(#loc220 at #loc200))
808
+ #loc327 = loc(callsite(#loc221 at #loc200))
809
+ #loc328 = loc(callsite(#loc222 at #loc200))
810
+ #loc329 = loc(callsite(#loc223 at #loc200))
811
+ #loc330 = loc(callsite(#loc224 at #loc200))
812
+ #loc331 = loc(callsite(#loc225 at #loc200))
813
+ #loc332 = loc(callsite(#loc226 at #loc200))
814
+ #loc333 = loc(callsite(#loc227 at #loc200))
815
+ #loc334 = loc(callsite(#loc228 at #loc200))
816
+ #loc335 = loc(callsite(#loc229 at #loc200))
817
+ #loc336 = loc(callsite(#loc230 at #loc200))
818
+ #loc337 = loc(callsite(#loc231 at #loc200))
819
+ #loc338 = loc(callsite(#loc232 at #loc200))
820
+ #loc339 = loc(callsite(#loc233 at #loc200))
821
+ #loc340 = loc(callsite(#loc234 at #loc200))
822
+ #loc341 = loc(callsite(#loc235 at #loc200))
823
+ #loc342 = loc(callsite(#loc236 at #loc200))
824
+ #loc343 = loc(callsite(#loc237 at #loc200))
825
+ #loc344 = loc(callsite(#loc238 at #loc200))
826
+ #loc346 = loc(callsite(#loc240 at #loc200))
827
+ #loc347 = loc(callsite(#loc241 at #loc200))
828
+ #loc348 = loc(callsite(#loc242 at #loc200))
829
+ #loc349 = loc(callsite(#loc243 at #loc200))
830
+ #loc350 = loc(callsite(#loc244 at #loc200))
831
+ #loc351 = loc(callsite(#loc245 at #loc200))
832
+ #loc352 = loc(callsite(#loc246 at #loc200))
833
+ #loc353 = loc(callsite(#loc247 at #loc200))
834
+ #loc354 = loc(callsite(#loc248 at #loc200))
835
+ #loc356 = loc(callsite(#loc250 at #loc200))
836
+ #loc357 = loc(callsite(#loc251 at #loc200))
837
+ #loc358 = loc(callsite(#loc252 at #loc200))
838
+ #loc359 = loc(callsite(#loc253 at #loc200))
839
+ #loc360 = loc(callsite(#loc254 at #loc200))
840
+ #loc361 = loc(callsite(#loc255 at #loc42))
841
+ #loc362 = loc(callsite(#loc257 at #loc42))
842
+ #loc363 = loc(callsite(#loc274 at #loc42))
843
+ #loc364 = loc(callsite(#loc199 at #loc283))
844
+ #loc365 = loc(callsite(#loc215 at #loc283))
845
+ #loc366 = loc(callsite(#loc217 at #loc283))
846
+ #loc367 = loc(callsite(#loc218 at #loc283))
847
+ #loc368 = loc(callsite(#loc219 at #loc283))
848
+ #loc369 = loc(callsite(#loc220 at #loc283))
849
+ #loc370 = loc(callsite(#loc221 at #loc283))
850
+ #loc371 = loc(callsite(#loc222 at #loc283))
851
+ #loc372 = loc(callsite(#loc223 at #loc283))
852
+ #loc373 = loc(callsite(#loc238 at #loc283))
853
+ #loc375 = loc(callsite(#loc240 at #loc283))
854
+ #loc376 = loc(callsite(#loc241 at #loc283))
855
+ #loc377 = loc(callsite(#loc242 at #loc283))
856
+ #loc378 = loc(callsite(#loc243 at #loc283))
857
+ #loc379 = loc(callsite(#loc244 at #loc283))
858
+ #loc380 = loc(callsite(#loc245 at #loc283))
859
+ #loc381 = loc(callsite(#loc246 at #loc283))
860
+ #loc382 = loc(callsite(#loc247 at #loc283))
861
+ #loc383 = loc(callsite(#loc248 at #loc283))
862
+ #loc385 = loc(callsite(#loc250 at #loc283))
863
+ #loc386 = loc(callsite(#loc251 at #loc283))
864
+ #loc387 = loc(callsite(#loc252 at #loc283))
865
+ #loc388 = loc(callsite(#loc253 at #loc283))
866
+ #loc389 = loc(callsite(#loc254 at #loc283))
867
+ #loc390 = loc(callsite(#loc255 at #loc131))
868
+ #loc391 = loc(callsite(#loc257 at #loc131))
869
+ #loc392 = loc(callsite(#loc274 at #loc131))
870
+ #loc393 = loc(callsite(#loc185 at #loc306))
871
+ #loc394 = loc(callsite(#loc187 at #loc306))
872
+ #loc395 = loc(callsite(#loc24 at #loc306))
873
+ #loc396 = loc(callsite(#loc43 at #loc307))
874
+ #loc397 = loc(callsite(#loc43 at #loc308))
875
+ #loc398 = loc(callsite(#loc185 at #loc321))
876
+ #loc399 = loc(callsite(#loc25 at #loc306))
877
+ #loc400 = loc(callsite(#loc25 at #loc321))
878
+ #loc401 = loc("m_i"(#loc322))
879
+ #loc402 = loc(callsite(#loc182 at #loc306))
880
+ #loc403 = loc(callsite(#loc184 at #loc306))
881
+ #loc404 = loc(callsite(#loc187 at #loc321))
882
+ #loc405 = loc(callsite(#loc82 at #loc345))
883
+ #loc407 = loc(callsite(#loc94 at #loc355))
884
+ #loc409 = loc(callsite(#loc256 at #loc362))
885
+ #loc410 = loc(callsite(#loc258 at #loc362))
886
+ #loc411 = loc(callsite(#loc259 at #loc362))
887
+ #loc412 = loc(callsite(#loc260 at #loc362))
888
+ #loc413 = loc(callsite(#loc261 at #loc362))
889
+ #loc414 = loc(callsite(#loc262 at #loc362))
890
+ #loc415 = loc(callsite(#loc263 at #loc362))
891
+ #loc416 = loc(callsite(#loc264 at #loc362))
892
+ #loc417 = loc(callsite(#loc265 at #loc362))
893
+ #loc418 = loc(callsite(#loc266 at #loc362))
894
+ #loc419 = loc(callsite(#loc267 at #loc362))
895
+ #loc420 = loc(callsite(#loc268 at #loc362))
896
+ #loc421 = loc(callsite(#loc269 at #loc362))
897
+ #loc422 = loc(callsite(#loc270 at #loc362))
898
+ #loc423 = loc(callsite(#loc271 at #loc362))
899
+ #loc424 = loc(callsite(#loc272 at #loc362))
900
+ #loc425 = loc(callsite(#loc273 at #loc362))
901
+ #loc426 = loc(callsite(#loc25 at #loc364))
902
+ #loc427 = loc(callsite(#loc25 at #loc365))
903
+ #loc428 = loc(callsite(#loc182 at #loc364))
904
+ #loc429 = loc(callsite(#loc184 at #loc364))
905
+ #loc430 = loc(callsite(#loc185 at #loc364))
906
+ #loc431 = loc(callsite(#loc187 at #loc364))
907
+ #loc432 = loc(callsite(#loc24 at #loc364))
908
+ #loc433 = loc(callsite(#loc185 at #loc365))
909
+ #loc434 = loc(callsite(#loc187 at #loc365))
910
+ #loc435 = loc(callsite(#loc82 at #loc374))
911
+ #loc437 = loc(callsite(#loc94 at #loc384))
912
+ #loc439 = loc(callsite(#loc256 at #loc391))
913
+ #loc440 = loc(callsite(#loc258 at #loc391))
914
+ #loc441 = loc(callsite(#loc259 at #loc391))
915
+ #loc442 = loc(callsite(#loc260 at #loc391))
916
+ #loc443 = loc(callsite(#loc261 at #loc391))
917
+ #loc444 = loc(callsite(#loc262 at #loc391))
918
+ #loc445 = loc(callsite(#loc263 at #loc391))
919
+ #loc446 = loc(callsite(#loc264 at #loc391))
920
+ #loc447 = loc(callsite(#loc265 at #loc391))
921
+ #loc448 = loc(callsite(#loc266 at #loc391))
922
+ #loc449 = loc(callsite(#loc267 at #loc391))
923
+ #loc450 = loc(callsite(#loc268 at #loc391))
924
+ #loc451 = loc(callsite(#loc269 at #loc391))
925
+ #loc452 = loc(callsite(#loc270 at #loc391))
926
+ #loc453 = loc(callsite(#loc271 at #loc391))
927
+ #loc454 = loc(callsite(#loc272 at #loc391))
928
+ #loc455 = loc(callsite(#loc273 at #loc391))
929
+ #loc456 = loc("offs_n"(#loc401))
930
+ #loc457 = loc(callsite(#loc84 at #loc405))
931
+ #loc458 = loc(callsite(#loc96 at #loc407))
932
+ #loc459 = loc(callsite(#loc84 at #loc435))
933
+ #loc460 = loc(callsite(#loc96 at #loc437))
934
+ #loc461 = loc("kv_offset"(#loc456))
935
+ #loc462 = loc(callsite(#loc461 at #loc42))
936
+ #loc463 = loc(callsite(#loc461 at #loc131))
progress/github/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":172:41)
4
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":520:16)
5
+ #loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:51)
6
+ #loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:34)
7
+ #loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":198:45)
8
+ #loc156 = loc("arg_Q"(#loc))
9
+ #loc157 = loc("arg_K"(#loc))
10
+ #loc158 = loc("arg_V"(#loc))
11
+ #loc159 = loc("arg_LSE"(#loc))
12
+ #loc160 = loc("arg_MAX"(#loc))
13
+ #loc161 = loc("arg_KV_NUM_BLKS"(#loc))
14
+ #loc162 = loc("arg_KV_IDX"(#loc))
15
+ #loc163 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
16
+ #loc164 = loc("arg_FULL_KV_IDX"(#loc))
17
+ #loc165 = loc("out_ptr0"(#loc))
18
+ #loc166 = loc("ks0"(#loc))
19
+ #loc167 = loc("ks1"(#loc))
20
+ #loc168 = loc("ks2"(#loc))
21
+ #loc169 = loc("ks3"(#loc))
22
+ #loc170 = loc("ks4"(#loc))
23
+ #loc210 = loc(callsite(#loc48 at #loc2))
24
+ #loc247 = loc("m_ij"(#loc87))
25
+ #loc257 = loc("l_i"(#loc99))
26
+ #loc293 = loc(callsite(#loc48 at #loc137))
27
+ #loc354 = loc(callsite(#loc247 at #loc210))
28
+ #loc364 = loc(callsite(#loc257 at #loc210))
29
+ #loc383 = loc(callsite(#loc247 at #loc293))
30
+ #loc393 = loc(callsite(#loc257 at #loc293))
31
+ #loc413 = loc(callsite(#loc1 at #loc354))
32
+ #loc415 = loc(callsite(#loc1 at #loc364))
33
+ #loc443 = loc(callsite(#loc1 at #loc383))
34
+ #loc445 = loc(callsite(#loc1 at #loc393))
35
+ module {
36
+ tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
37
+ %cst = arith.constant dense<1024> : tensor<64x1xi32> loc(#loc1)
38
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
39
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1)
40
+ %cst_1 = arith.constant dense<16> : tensor<1x64xi32> loc(#loc171)
41
+ %cst_2 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc171)
42
+ %cst_3 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1)
43
+ %cst_4 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1)
44
+ %cst_5 = arith.constant dense<false> : tensor<128x64xi1> loc(#loc171)
45
+ %cst_6 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc171)
46
+ %cst_7 = arith.constant dense<1> : tensor<128x1xi32> loc(#loc171)
47
+ %cst_8 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc171)
48
+ %cst_9 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc171)
49
+ %cst_10 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1)
50
+ %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1)
51
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1)
52
+ %c63_i32 = arith.constant 63 : i32 loc(#loc1)
53
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
54
+ %q = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc306)
55
+ %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc307)
56
+ %cst_13 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1)
57
+ %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc174)
58
+ %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc175)
59
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1)
60
+ %c2_i32 = arith.constant 2 : i32 loc(#loc1)
61
+ %c4_i32 = arith.constant 4 : i32 loc(#loc1)
62
+ %HQ = arith.constant 32 : i32 loc(#loc176)
63
+ %c1_i32 = arith.constant 1 : i32 loc(#loc1)
64
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
65
+ %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
66
+ %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10)
67
+ %q_start = tt.get_program_id x : i32 loc(#loc177)
68
+ %off_zq = tt.get_program_id y : i32 loc(#loc178)
69
+ %off_hq = tt.get_program_id z : i32 loc(#loc179)
70
+ %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc180)
71
+ %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc181)
72
+ %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc182)
73
+ %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc183)
74
+ %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc184)
75
+ %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr<bf16>, i32 loc(#loc185)
76
+ %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc186)
77
+ %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc187)
78
+ %sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc188)
79
+ %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc189)
80
+ %offs_m_17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc190)
81
+ %offs_m_18 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc191)
82
+ %offs_m_19 = arith.addi %offs_m_18, %offs_m_17 : tensor<128xi32> loc(#loc191)
83
+ %ptr = tt.expand_dims %offs_m_19 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc308)
84
+ %ptr_20 = arith.muli %ptr, %cst_13 : tensor<128x1xi32> loc(#loc309)
85
+ %ptr_21 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>> loc(#loc310)
86
+ %ptr_22 = tt.addptr %ptr_21, %ptr_20 : tensor<128x1x!tt.ptr<bf16>>, tensor<128x1xi32> loc(#loc310)
87
+ %ptr_23 = tt.expand_dims %offs_m_17 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc311)
88
+ %ptr_24 = tt.broadcast %ptr_22 : tensor<128x1x!tt.ptr<bf16>> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc312)
89
+ %ptr_25 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc312)
90
+ %ptr_26 = tt.addptr %ptr_24, %ptr_25 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc312)
91
+ %q_27 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc313)
92
+ %q_28 = arith.cmpi slt, %ptr, %q_27 : tensor<128x1xi32> loc(#loc313)
93
+ %q_29 = tt.broadcast %q_28 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc306)
94
+ %q_30 = tt.load %ptr_26, %q_29, %q : tensor<128x128x!tt.ptr<bf16>> loc(#loc306)
95
+ %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc197)
96
+ %kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc198)
97
+ %kv_start_31 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc199)
98
+ %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc200)
99
+ %kv_num_blocks_32 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc201)
100
+ %block_n_end = arith.muli %kv_num_blocks_32, %c2_i32 : i32 loc(#loc202)
101
+ %block_n_end_33 = arith.addi %ks1, %c63_i32 : i32 loc(#loc314)
102
+ %block_n_end_34 = arith.divsi %block_n_end_33, %c64_i32 : i32 loc(#loc315)
103
+ %block_n_end_35 = arith.maxsi %block_n_end_34, %c1_i32 : i32 loc(#loc204)
104
+ %block_n_end_36 = arith.minsi %block_n_end, %block_n_end_35 : i32 loc(#loc205)
105
+ %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc206)
106
+ %offs_n_37 = tt.splat %kv_start_31 : i32 -> tensor<64xi32> loc(#loc207)
107
+ %offs_n_38 = arith.addi %offs_n_37, %offs_n : tensor<64xi32> loc(#loc207)
108
+ %1 = tt.expand_dims %offs_n_38 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc45)
109
+ %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_36 step %c1_i32 iter_args(%acc_62 = %acc, %l_i_63 = %cst_14, %m_i = %cst_3, %offs_n_64 = %1, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 {
110
+ %kv_base_offset = arith.addi %kv_start_31, %kv_offset_65 : i32 loc(#loc317)
111
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc318)
112
+ %offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc318)
113
+ %ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc404)
114
+ %ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc405)
115
+ %ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc406)
116
+ %ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc406)
117
+ %ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc407)
118
+ %ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc407)
119
+ %ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc407)
120
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc408)
121
+ %k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc408)
122
+ %k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc409)
123
+ %k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc409)
124
+ %k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc320)
125
+ %qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc321)
126
+ %qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc322)
127
+ %m = arith.remsi %ptr, %q_27 : tensor<128x1xi32> loc(#loc410)
128
+ %n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc411)
129
+ %n_79 = arith.remsi %offs_n_64, %n : tensor<1x64xi32> loc(#loc411)
130
+ %post_mod_scores = arith.cmpi slt, %offs_n_64, %n : tensor<1x64xi32> loc(#loc325)
131
+ %post_mod_scores_80 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc326)
132
+ %post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc326)
133
+ %tmp3 = arith.cmpi slt, %m, %cst_8 : tensor<128x1xi32> loc(#loc327)
134
+ %tmp5 = tt.broadcast %n_79 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc328)
135
+ %tmp5_82 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc328)
136
+ %tmp5_83 = arith.cmpi sle, %tmp5, %tmp5_82 : tensor<128x64xi32> loc(#loc328)
137
+ %tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc329)
138
+ %tmp6_84 = arith.andi %tmp6, %tmp5_83 : tensor<128x64xi1> loc(#loc329)
139
+ %tmp7 = arith.cmpi sge, %m, %cst_8 : tensor<128x1xi32> loc(#loc330)
140
+ %tmp8 = arith.cmpi slt, %n_79, %cst_9 : tensor<1x64xi32> loc(#loc331)
141
+ %tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc332)
142
+ %tmp9_85 = tt.broadcast %tmp8 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc332)
143
+ %tmp9_86 = arith.andi %tmp9, %tmp9_85 : tensor<128x64xi1> loc(#loc332)
144
+ %tmp10 = arith.extui %tmp8 : tensor<1x64xi1> to tensor<1x64xi32> loc(#loc333)
145
+ %tmp10_87 = arith.cmpi eq, %tmp10, %cst_9 : tensor<1x64xi32> loc(#loc333)
146
+ %tmp11 = tt.broadcast %tmp10_87 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc334)
147
+ %tmp11_88 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1> loc(#loc334)
148
+ %tmp14 = arith.remsi %m, %cst_2 : tensor<128x1xi32> loc(#loc335)
149
+ %tmp14_89 = arith.cmpi ne, %tmp14, %cst_8 : tensor<128x1xi32> loc(#loc336)
150
+ %tmp14_90 = arith.divsi %m, %cst_2 : tensor<128x1xi32> loc(#loc337)
151
+ %tmp14_91 = arith.subi %tmp14_90, %cst_7 : tensor<128x1xi32> loc(#loc338)
152
+ %tmp14_92 = arith.select %tmp14_89, %tmp14_91, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc339)
153
+ %tmp14_93 = arith.select %tmp3, %tmp14_92, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc340)
154
+ %tmp16 = arith.remsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc341)
155
+ %tmp16_94 = arith.cmpi ne, %tmp16, %cst_9 : tensor<1x64xi32> loc(#loc342)
156
+ %tmp16_95 = arith.divsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc343)
157
+ %tmp16_96 = arith.subi %tmp16_95, %cst_6 : tensor<1x64xi32> loc(#loc344)
158
+ %tmp16_97 = arith.select %tmp16_94, %tmp16_96, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc345)
159
+ %tmp16_98 = arith.select %tmp8, %tmp16_97, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc346)
160
+ %tmp17 = tt.broadcast %tmp14_93 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc347)
161
+ %tmp17_99 = tt.broadcast %tmp16_98 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc347)
162
+ %tmp17_100 = arith.cmpi eq, %tmp17, %tmp17_99 : tensor<128x64xi32> loc(#loc347)
163
+ %tmp18 = arith.andi %tmp11_88, %tmp17_100 : tensor<128x64xi1> loc(#loc348)
164
+ %tmp19 = arith.ori %tmp9_86, %tmp18 : tensor<128x64xi1> loc(#loc349)
165
+ %tmp20 = arith.ori %tmp6_84, %tmp19 : tensor<128x64xi1> loc(#loc350)
166
+ %mask_mod_output = arith.select %post_mod_scores_80, %tmp20, %cst_5 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc351)
167
+ %post_mod_scores_101 = arith.select %mask_mod_output, %post_mod_scores_81, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc352)
168
+ %post_mod_scores_102 = arith.mulf %post_mod_scores_101, %cst_4 : tensor<128x64xf32> loc(#loc353)
169
+ %m_ij = "tt.reduce"(%post_mod_scores_102) <{axis = 1 : i32}> ({
170
+ ^bb0(%m_ij_135: f32 loc(callsite(#loc1 at #loc354)), %m_ij_136: f32 loc(callsite(#loc1 at #loc354))):
171
+ %m_ij_137 = arith.maxnumf %m_ij_135, %m_ij_136 : f32 loc(#loc467)
172
+ tt.reduce.return %m_ij_137 : f32 loc(#loc412)
173
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc412)
174
+ %m_ij_103 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc355)
175
+ %masked_out_rows = arith.cmpf oeq, %m_ij_103, %cst_3 : tensor<128xf32> loc(#loc356)
176
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_103 : tensor<128xi1>, tensor<128xf32> loc(#loc357)
177
+ %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc358)
178
+ %alpha_104 = math.exp2 %alpha : tensor<128xf32> loc(#loc359)
179
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc360)
180
+ %p_105 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc361)
181
+ %p_106 = arith.subf %post_mod_scores_102, %p_105 : tensor<128x64xf32> loc(#loc361)
182
+ %p_107 = math.exp2 %p_106 : tensor<128x64xf32> loc(#loc362)
183
+ %l_i_108 = arith.mulf %l_i_63, %alpha_104 : tensor<128xf32> loc(#loc363)
184
+ %l_i_109 = "tt.reduce"(%p_107) <{axis = 1 : i32}> ({
185
+ ^bb0(%l_i_135: f32 loc(callsite(#loc1 at #loc364)), %l_i_136: f32 loc(callsite(#loc1 at #loc364))):
186
+ %l_i_137 = arith.addf %l_i_135, %l_i_136 : f32 loc(#loc468)
187
+ tt.reduce.return %l_i_137 : f32 loc(#loc414)
188
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc414)
189
+ %l_i_110 = arith.addf %l_i_108, %l_i_109 : tensor<128xf32> loc(#loc365)
190
+ %acc_111 = tt.expand_dims %alpha_104 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc366)
191
+ %acc_112 = tt.broadcast %acc_111 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc367)
192
+ %acc_113 = arith.mulf %acc_62, %acc_112 : tensor<128x128xf32> loc(#loc367)
193
+ %ptr_114 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc416)
194
+ %ptr_115 = tt.addptr %ptr_114, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc416)
195
+ %ptr_116 = tt.broadcast %ptr_115 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc417)
196
+ %ptr_117 = tt.addptr %ptr_116, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc417)
197
+ %v = tt.load %ptr_117, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc418)
198
+ %acc_118 = arith.truncf %p_107 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc369)
199
+ %acc_119 = tt.dot %acc_118, %v, %acc_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc370)
200
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc419)
201
+ %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc420)
202
+ %cur_block_120 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc421)
203
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc422)
204
+ %next_block_121 = arith.cmpi slt, %next_block, %kv_num_blocks_32 : i32 loc(#loc423)
205
+ %next_block_122 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc424)
206
+ %next_block_123 = tt.load %next_block_122, %next_block_121 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc425)
207
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc426)
208
+ %needs_jump_124 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc427)
209
+ %needs_jump_125 = arith.cmpi eq, %needs_jump_124, %c0_i32 : i32 loc(#loc428)
210
+ %jump_to_block = arith.subi %next_block_123, %cur_block_120 : i32 loc(#loc429)
211
+ %jump_to_block_126 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc430)
212
+ %jump_to_block_127 = arith.subi %jump_to_block_126, %c64_i32 : i32 loc(#loc431)
213
+ %offset = arith.extui %needs_jump_125 : i1 to i32 loc(#loc432)
214
+ %offset_128 = arith.muli %jump_to_block_127, %offset : i32 loc(#loc432)
215
+ %offset_129 = arith.subi %c1_i32, %offset : i32 loc(#loc433)
216
+ %offset_130 = arith.muli %offset_129, %c64_i32 : i32 loc(#loc434)
217
+ %offset_131 = arith.addi %offset_128, %offset_130 : i32 loc(#loc435)
218
+ %offs_n_132 = tt.splat %offset_131 : i32 -> tensor<1x64xi32> loc(#loc372)
219
+ %offs_n_133 = arith.addi %offs_n_64, %offs_n_132 : tensor<1x64xi32> loc(#loc372)
220
+ %kv_offset_134 = arith.addi %kv_offset_65, %offset_131 : i32 loc(#loc373)
221
+ scf.yield %acc_119, %l_i_110, %m_ij_103, %offs_n_133, %kv_offset_134 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc284)
222
+ } loc(#loc472)
223
+ %kv_indices_39 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc285)
224
+ %kv_start_40 = tt.load %kv_indices_39 : !tt.ptr<i32> loc(#loc286)
225
+ %kv_start_41 = arith.muli %kv_start_40, %c128_i32 : i32 loc(#loc287)
226
+ %kv_num_blocks_42 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc288)
227
+ %kv_num_blocks_43 = tt.load %kv_num_blocks_42 : !tt.ptr<i32> loc(#loc289)
228
+ %block_n_end_44 = arith.muli %kv_num_blocks_43, %c2_i32 : i32 loc(#loc290)
229
+ %block_n_end_45 = arith.minsi %block_n_end_44, %block_n_end_35 : i32 loc(#loc291)
230
+ %offs_n_46 = tt.splat %kv_start_41 : i32 -> tensor<64xi32> loc(#loc292)
231
+ %offs_n_47 = arith.addi %offs_n_46, %offs_n : tensor<64xi32> loc(#loc292)
232
+ %2 = tt.expand_dims %offs_n_47 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc136)
233
+ %kv_offset_48:5 = scf.for %start_n = %c0_i32 to %block_n_end_45 step %c1_i32 iter_args(%acc_62 = %kv_offset#0, %l_i_63 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_64 = %2, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 {
234
+ %kv_base_offset = arith.addi %kv_start_41, %kv_offset_65 : i32 loc(#loc374)
235
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc375)
236
+ %offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc375)
237
+ %ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc436)
238
+ %ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc437)
239
+ %ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc438)
240
+ %ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc438)
241
+ %ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc439)
242
+ %ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc439)
243
+ %ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc439)
244
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc440)
245
+ %k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc440)
246
+ %k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc441)
247
+ %k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc441)
248
+ %k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc377)
249
+ %qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc378)
250
+ %qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc379)
251
+ %post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc380)
252
+ %post_mod_scores_79 = arith.cmpi slt, %offs_n_64, %post_mod_scores : tensor<1x64xi32> loc(#loc380)
253
+ %post_mod_scores_80 = tt.broadcast %post_mod_scores_79 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc381)
254
+ %post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc381)
255
+ %post_mod_scores_82 = arith.mulf %post_mod_scores_81, %cst_4 : tensor<128x64xf32> loc(#loc382)
256
+ %m_ij = "tt.reduce"(%post_mod_scores_82) <{axis = 1 : i32}> ({
257
+ ^bb0(%m_ij_115: f32 loc(callsite(#loc1 at #loc383)), %m_ij_116: f32 loc(callsite(#loc1 at #loc383))):
258
+ %m_ij_117 = arith.maxnumf %m_ij_115, %m_ij_116 : f32 loc(#loc469)
259
+ tt.reduce.return %m_ij_117 : f32 loc(#loc442)
260
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc442)
261
+ %m_ij_83 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc384)
262
+ %masked_out_rows = arith.cmpf oeq, %m_ij_83, %cst_3 : tensor<128xf32> loc(#loc385)
263
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_83 : tensor<128xi1>, tensor<128xf32> loc(#loc386)
264
+ %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc387)
265
+ %alpha_84 = math.exp2 %alpha : tensor<128xf32> loc(#loc388)
266
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc389)
267
+ %p_85 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc390)
268
+ %p_86 = arith.subf %post_mod_scores_82, %p_85 : tensor<128x64xf32> loc(#loc390)
269
+ %p_87 = math.exp2 %p_86 : tensor<128x64xf32> loc(#loc391)
270
+ %l_i_88 = arith.mulf %l_i_63, %alpha_84 : tensor<128xf32> loc(#loc392)
271
+ %l_i_89 = "tt.reduce"(%p_87) <{axis = 1 : i32}> ({
272
+ ^bb0(%l_i_115: f32 loc(callsite(#loc1 at #loc393)), %l_i_116: f32 loc(callsite(#loc1 at #loc393))):
273
+ %l_i_117 = arith.addf %l_i_115, %l_i_116 : f32 loc(#loc470)
274
+ tt.reduce.return %l_i_117 : f32 loc(#loc444)
275
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc444)
276
+ %l_i_90 = arith.addf %l_i_88, %l_i_89 : tensor<128xf32> loc(#loc394)
277
+ %acc_91 = tt.expand_dims %alpha_84 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc395)
278
+ %acc_92 = tt.broadcast %acc_91 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc396)
279
+ %acc_93 = arith.mulf %acc_62, %acc_92 : tensor<128x128xf32> loc(#loc396)
280
+ %ptr_94 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc446)
281
+ %ptr_95 = tt.addptr %ptr_94, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc446)
282
+ %ptr_96 = tt.broadcast %ptr_95 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc447)
283
+ %ptr_97 = tt.addptr %ptr_96, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc447)
284
+ %v = tt.load %ptr_97, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc448)
285
+ %acc_98 = arith.truncf %p_87 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc398)
286
+ %acc_99 = tt.dot %acc_98, %v, %acc_93, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc399)
287
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc449)
288
+ %cur_block = tt.addptr %kv_indices_39, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc450)
289
+ %cur_block_100 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc451)
290
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc452)
291
+ %next_block_101 = arith.cmpi slt, %next_block, %kv_num_blocks_43 : i32 loc(#loc453)
292
+ %next_block_102 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc454)
293
+ %next_block_103 = tt.load %next_block_102, %next_block_101 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc455)
294
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc456)
295
+ %needs_jump_104 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc457)
296
+ %needs_jump_105 = arith.cmpi eq, %needs_jump_104, %c0_i32 : i32 loc(#loc458)
297
+ %jump_to_block = arith.subi %next_block_103, %cur_block_100 : i32 loc(#loc459)
298
+ %jump_to_block_106 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc460)
299
+ %jump_to_block_107 = arith.subi %jump_to_block_106, %c64_i32 : i32 loc(#loc461)
300
+ %offset = arith.extui %needs_jump_105 : i1 to i32 loc(#loc462)
301
+ %offset_108 = arith.muli %jump_to_block_107, %offset : i32 loc(#loc462)
302
+ %offset_109 = arith.subi %c1_i32, %offset : i32 loc(#loc463)
303
+ %offset_110 = arith.muli %offset_109, %c64_i32 : i32 loc(#loc464)
304
+ %offset_111 = arith.addi %offset_108, %offset_110 : i32 loc(#loc465)
305
+ %offs_n_112 = tt.splat %offset_111 : i32 -> tensor<1x64xi32> loc(#loc401)
306
+ %offs_n_113 = arith.addi %offs_n_64, %offs_n_112 : tensor<1x64xi32> loc(#loc401)
307
+ %kv_offset_114 = arith.addi %kv_offset_65, %offset_111 : i32 loc(#loc402)
308
+ scf.yield %acc_99, %l_i_90, %m_ij_83, %offs_n_113, %kv_offset_114 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc294)
309
+ } loc(#loc473)
310
+ %l_i_49 = arith.cmpf oeq, %kv_offset_48#1, %cst_14 : tensor<128xf32> loc(#loc295)
311
+ %l_i_50 = arith.select %l_i_49, %l_i, %kv_offset_48#1 : tensor<128xi1>, tensor<128xf32> loc(#loc175)
312
+ %acc_51 = tt.expand_dims %l_i_50 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc296)
313
+ %acc_52 = tt.broadcast %acc_51 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc297)
314
+ %acc_53 = arith.divf %kv_offset_48#0, %acc_52 : tensor<128x128xf32> loc(#loc297)
315
+ %mask_54 = arith.cmpi slt, %ptr_23, %mask : tensor<1x128xi32> loc(#loc174)
316
+ %mask_55 = tt.broadcast %mask_54 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc298)
317
+ %mask_56 = arith.andi %q_29, %mask_55 : tensor<128x128xi1> loc(#loc298)
318
+ %3 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32> loc(#loc142)
319
+ %4 = arith.addi %ptr_23, %3 : tensor<1x128xi32> loc(#loc142)
320
+ %5 = tt.broadcast %4 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc143)
321
+ %6 = tt.broadcast %ptr_20 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc143)
322
+ %7 = arith.addi %5, %6 : tensor<128x128xi32> loc(#loc143)
323
+ %8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc144)
324
+ %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc144)
325
+ %10 = arith.truncf %acc_53 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc145)
326
+ tt.store %9, %10, %mask_56 : tensor<128x128x!tt.ptr<bf16>> loc(#loc145)
327
+ %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc299)
328
+ %off_hz_57 = arith.addi %off_hz, %off_hq : i32 loc(#loc300)
329
+ %l_ptrs = arith.muli %off_hz_57, %ks0 : i32 loc(#loc301)
330
+ %l_ptrs_58 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc302)
331
+ %l_ptrs_59 = tt.splat %l_ptrs_58 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc303)
332
+ %l_ptrs_60 = tt.addptr %l_ptrs_59, %offs_m_19 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc303)
333
+ %lse = math.log2 %l_i_50 : tensor<128xf32> loc(#loc304)
334
+ %lse_61 = arith.addf %kv_offset_48#2, %lse : tensor<128xf32> loc(#loc305)
335
+ %11 = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc153)
336
+ %12 = arith.cmpi slt, %offs_m_19, %11 : tensor<128xi32> loc(#loc153)
337
+ tt.store %l_ptrs_60, %lse_61, %12 : tensor<128x!tt.ptr<f32>> loc(#loc154)
338
+ tt.return loc(#loc155)
339
+ } loc(#loc)
340
+ } loc(#loc)
341
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:23)
342
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":146:101)
343
+ #loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
344
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":136:19)
345
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:38)
346
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:34)
347
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":90:9)
348
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":85:54)
349
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":97:28)
350
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":98:27)
351
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":99:27)
352
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":104:24)
353
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:24)
354
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:45)
355
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:36)
356
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":108:47)
357
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":111:12)
358
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":112:12)
359
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":113:12)
360
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":143:97)
361
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:23)
362
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:46)
363
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:33)
364
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:27)
365
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:38)
366
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:20)
367
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:56)
368
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:49)
369
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:52)
370
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":151:26)
371
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:23)
372
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:37)
373
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:42)
374
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:28)
375
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:45)
376
+ #loc38 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
377
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:92)
378
+ #loc40 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
379
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:102)
380
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:65)
381
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:37)
382
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:24)
383
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":167:48)
384
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":484:40)
385
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":342:32)
386
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":346:35)
387
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":347:107)
388
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":349:17)
389
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":351:19)
390
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":353:14)
391
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":257:21)
392
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":358:36)
393
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":359:36)
394
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:44)
395
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:69)
396
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":372:22)
397
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":374:23)
398
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":375:22)
399
+ #loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":376:23)
400
+ #loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":377:22)
401
+ #loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":378:22)
402
+ #loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":379:24)
403
+ #loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":380:23)
404
+ #loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:70)
405
+ #loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:79)
406
+ #loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:91)
407
+ #loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:99)
408
+ #loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:102)
409
+ #loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:119)
410
+ #loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:70)
411
+ #loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:79)
412
+ #loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:91)
413
+ #loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:99)
414
+ #loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:102)
415
+ #loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:119)
416
+ #loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":386:25)
417
+ #loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":387:24)
418
+ #loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":388:23)
419
+ #loc82 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":389:23)
420
+ #loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":394:73)
421
+ #loc84 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":396:69)
422
+ #loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":399:27)
423
+ #loc86 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
424
+ #loc88 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
425
+ #loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:27)
426
+ #loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":405:35)
427
+ #loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":406:51)
428
+ #loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:31)
429
+ #loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:25)
430
+ #loc94 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:51)
431
+ #loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:39)
432
+ #loc96 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:21)
433
+ #loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:16)
434
+ #loc98 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
435
+ #loc100 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
436
+ #loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:24)
437
+ #loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:22)
438
+ #loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:16)
439
+ #loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":421:107)
440
+ #loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:22)
441
+ #loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:44)
442
+ #loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":247:33)
443
+ #loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":527:63)
444
+ #loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:38)
445
+ #loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:24)
446
+ #loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:109)
447
+ #loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:113)
448
+ #loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:55)
449
+ #loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:25)
450
+ #loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:30)
451
+ #loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:35)
452
+ #loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:60)
453
+ #loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:34)
454
+ #loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:48)
455
+ #loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:63)
456
+ #loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:29)
457
+ #loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:47)
458
+ #loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:61)
459
+ #loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:42)
460
+ #loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":530:26)
461
+ #loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":531:21)
462
+ #loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":531:8)
463
+ #loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":181:35)
464
+ #loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:27)
465
+ #loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:41)
466
+ #loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:51)
467
+ #loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:32)
468
+ #loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:49)
469
+ #loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:69)
470
+ #loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":186:28)
471
+ #loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":193:52)
472
+ #loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:26)
473
+ #loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:20)
474
+ #loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:16)
475
+ #loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:30)
476
+ #loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:49)
477
+ #loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:62)
478
+ #loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:25)
479
+ #loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:92)
480
+ #loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:26)
481
+ #loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:31)
482
+ #loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:32)
483
+ #loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:23)
484
+ #loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:40)
485
+ #loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:33)
486
+ #loc152 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:20)
487
+ #loc153 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:48)
488
+ #loc154 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:29)
489
+ #loc155 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":229:4)
490
+ #loc171 = loc(callsite(#loc1 at #loc2))
491
+ #loc172 = loc("q"(#loc4))
492
+ #loc173 = loc("acc"(#loc6))
493
+ #loc174 = loc("mask"(#loc7))
494
+ #loc175 = loc("l_i"(#loc8))
495
+ #loc176 = loc("HQ"(#loc9))
496
+ #loc177 = loc("q_start"(#loc11))
497
+ #loc178 = loc("off_zq"(#loc12))
498
+ #loc179 = loc("off_hq"(#loc13))
499
+ #loc180 = loc("off_hkv"(#loc14))
500
+ #loc181 = loc("q_offset"(#loc15))
501
+ #loc182 = loc("q_offset"(#loc16))
502
+ #loc183 = loc("q_offset"(#loc17))
503
+ #loc184 = loc("k_offset"(#loc18))
504
+ #loc185 = loc("Q"(#loc19))
505
+ #loc186 = loc("K"(#loc20))
506
+ #loc187 = loc("V"(#loc21))
507
+ #loc188 = loc("sparse_kv_idx_offset"(#loc22))
508
+ #loc189 = loc("offs_m"(#loc23))
509
+ #loc190 = loc("offs_m"(#loc24))
510
+ #loc191 = loc("offs_m"(#loc25))
511
+ #loc192 = loc("ptr"(#loc26))
512
+ #loc193 = loc("ptr"(#loc27))
513
+ #loc194 = loc("ptr"(#loc28))
514
+ #loc195 = loc("ptr"(#loc29))
515
+ #loc196 = loc("ptr"(#loc30))
516
+ #loc197 = loc("kv_indices"(#loc32))
517
+ #loc198 = loc("kv_start"(#loc33))
518
+ #loc199 = loc("kv_start"(#loc34))
519
+ #loc200 = loc("kv_num_blocks"(#loc35))
520
+ #loc201 = loc("kv_num_blocks"(#loc36))
521
+ #loc202 = loc("block_n_end"(#loc37))
522
+ #loc203 = loc("block_n_end"(#loc39))
523
+ #loc204 = loc("block_n_end"(#loc41))
524
+ #loc205 = loc("block_n_end"(#loc42))
525
+ #loc206 = loc("offs_n"(#loc43))
526
+ #loc207 = loc("offs_n"(#loc44))
527
+ #loc208 = loc("acc"(#loc46))
528
+ #loc209 = loc("kv_base_offset"(#loc47))
529
+ #loc211 = loc("offs_n_load"(#loc49))
530
+ #loc212 = loc("k"(#loc50))
531
+ #loc213 = loc("k"(#loc51))
532
+ #loc214 = loc("qk"(#loc52))
533
+ #loc215 = loc("qk"(#loc53))
534
+ #loc216 = loc("m"(#loc55))
535
+ #loc217 = loc("n"(#loc56))
536
+ #loc218 = loc("post_mod_scores"(#loc57))
537
+ #loc219 = loc("post_mod_scores"(#loc58))
538
+ #loc220 = loc("tmp3"(#loc59))
539
+ #loc221 = loc("tmp5"(#loc60))
540
+ #loc222 = loc("tmp6"(#loc61))
541
+ #loc223 = loc("tmp7"(#loc62))
542
+ #loc224 = loc("tmp8"(#loc63))
543
+ #loc225 = loc("tmp9"(#loc64))
544
+ #loc226 = loc("tmp10"(#loc65))
545
+ #loc227 = loc("tmp11"(#loc66))
546
+ #loc228 = loc("tmp14"(#loc67))
547
+ #loc229 = loc("tmp14"(#loc68))
548
+ #loc230 = loc("tmp14"(#loc69))
549
+ #loc231 = loc("tmp14"(#loc70))
550
+ #loc232 = loc("tmp14"(#loc71))
551
+ #loc233 = loc("tmp14"(#loc72))
552
+ #loc234 = loc("tmp16"(#loc73))
553
+ #loc235 = loc("tmp16"(#loc74))
554
+ #loc236 = loc("tmp16"(#loc75))
555
+ #loc237 = loc("tmp16"(#loc76))
556
+ #loc238 = loc("tmp16"(#loc77))
557
+ #loc239 = loc("tmp16"(#loc78))
558
+ #loc240 = loc("tmp17"(#loc79))
559
+ #loc241 = loc("tmp18"(#loc80))
560
+ #loc242 = loc("tmp19"(#loc81))
561
+ #loc243 = loc("tmp20"(#loc82))
562
+ #loc244 = loc("mask_mod_output"(#loc83))
563
+ #loc245 = loc("post_mod_scores"(#loc84))
564
+ #loc246 = loc("post_mod_scores"(#loc85))
565
+ #loc248 = loc("m_ij"(#loc89))
566
+ #loc249 = loc("masked_out_rows"(#loc90))
567
+ #loc250 = loc("m_ij_masked"(#loc91))
568
+ #loc251 = loc("alpha"(#loc92))
569
+ #loc252 = loc("alpha"(#loc93))
570
+ #loc253 = loc("p"(#loc94))
571
+ #loc254 = loc("p"(#loc95))
572
+ #loc255 = loc("p"(#loc96))
573
+ #loc256 = loc("l_i"(#loc97))
574
+ #loc258 = loc("l_i"(#loc101))
575
+ #loc259 = loc("acc"(#loc102))
576
+ #loc260 = loc("acc"(#loc103))
577
+ #loc261 = loc("v"(#loc104))
578
+ #loc262 = loc("acc"(#loc105))
579
+ #loc263 = loc("acc"(#loc106))
580
+ #loc264 = loc("cur_block_idx"(#loc107))
581
+ #loc265 = loc("offset"(#loc108))
582
+ #loc266 = loc("cur_block"(#loc109))
583
+ #loc267 = loc("cur_block"(#loc110))
584
+ #loc268 = loc("next_block"(#loc111))
585
+ #loc269 = loc("next_block"(#loc112))
586
+ #loc270 = loc("next_block"(#loc113))
587
+ #loc271 = loc("next_block"(#loc114))
588
+ #loc272 = loc("needs_jump"(#loc115))
589
+ #loc273 = loc("needs_jump"(#loc116))
590
+ #loc274 = loc("needs_jump"(#loc117))
591
+ #loc275 = loc("jump_to_block"(#loc118))
592
+ #loc276 = loc("jump_to_block"(#loc119))
593
+ #loc277 = loc("jump_to_block"(#loc120))
594
+ #loc278 = loc("offset"(#loc121))
595
+ #loc279 = loc("offset"(#loc122))
596
+ #loc280 = loc("offset"(#loc123))
597
+ #loc281 = loc("offset"(#loc124))
598
+ #loc282 = loc("offs_n"(#loc125))
599
+ #loc283 = loc("kv_offset"(#loc126))
600
+ #loc284 = loc(callsite(#loc127 at #loc2))
601
+ #loc285 = loc("kv_indices"(#loc128))
602
+ #loc286 = loc("kv_start"(#loc129))
603
+ #loc287 = loc("kv_start"(#loc130))
604
+ #loc288 = loc("kv_num_blocks"(#loc131))
605
+ #loc289 = loc("kv_num_blocks"(#loc132))
606
+ #loc290 = loc("block_n_end"(#loc133))
607
+ #loc291 = loc("block_n_end"(#loc134))
608
+ #loc292 = loc("offs_n"(#loc135))
609
+ #loc294 = loc(callsite(#loc127 at #loc137))
610
+ #loc295 = loc("l_i"(#loc138))
611
+ #loc296 = loc("acc"(#loc139))
612
+ #loc297 = loc("acc"(#loc140))
613
+ #loc298 = loc("mask"(#loc141))
614
+ #loc299 = loc("off_hz"(#loc146))
615
+ #loc300 = loc("off_hz"(#loc147))
616
+ #loc301 = loc("l_ptrs"(#loc148))
617
+ #loc302 = loc("l_ptrs"(#loc149))
618
+ #loc303 = loc("l_ptrs"(#loc150))
619
+ #loc304 = loc("lse"(#loc151))
620
+ #loc305 = loc("lse"(#loc152))
621
+ #loc306 = loc(callsite(#loc3 at #loc172))
622
+ #loc307 = loc(callsite(#loc5 at #loc173))
623
+ #loc308 = loc(callsite(#loc192 at #loc172))
624
+ #loc309 = loc(callsite(#loc193 at #loc172))
625
+ #loc310 = loc(callsite(#loc194 at #loc172))
626
+ #loc311 = loc(callsite(#loc195 at #loc172))
627
+ #loc312 = loc(callsite(#loc196 at #loc172))
628
+ #loc313 = loc(callsite(#loc31 at #loc172))
629
+ #loc314 = loc(callsite(#loc38 at #loc203))
630
+ #loc315 = loc(callsite(#loc40 at #loc203))
631
+ #loc316 = loc("l_i"(#loc208))
632
+ #loc317 = loc(callsite(#loc209 at #loc210))
633
+ #loc318 = loc(callsite(#loc211 at #loc210))
634
+ #loc319 = loc(callsite(#loc212 at #loc210))
635
+ #loc320 = loc(callsite(#loc213 at #loc210))
636
+ #loc321 = loc(callsite(#loc214 at #loc210))
637
+ #loc322 = loc(callsite(#loc215 at #loc210))
638
+ #loc323 = loc(callsite(#loc216 at #loc210))
639
+ #loc324 = loc(callsite(#loc217 at #loc210))
640
+ #loc325 = loc(callsite(#loc218 at #loc210))
641
+ #loc326 = loc(callsite(#loc219 at #loc210))
642
+ #loc327 = loc(callsite(#loc220 at #loc210))
643
+ #loc328 = loc(callsite(#loc221 at #loc210))
644
+ #loc329 = loc(callsite(#loc222 at #loc210))
645
+ #loc330 = loc(callsite(#loc223 at #loc210))
646
+ #loc331 = loc(callsite(#loc224 at #loc210))
647
+ #loc332 = loc(callsite(#loc225 at #loc210))
648
+ #loc333 = loc(callsite(#loc226 at #loc210))
649
+ #loc334 = loc(callsite(#loc227 at #loc210))
650
+ #loc335 = loc(callsite(#loc228 at #loc210))
651
+ #loc336 = loc(callsite(#loc229 at #loc210))
652
+ #loc337 = loc(callsite(#loc230 at #loc210))
653
+ #loc338 = loc(callsite(#loc231 at #loc210))
654
+ #loc339 = loc(callsite(#loc232 at #loc210))
655
+ #loc340 = loc(callsite(#loc233 at #loc210))
656
+ #loc341 = loc(callsite(#loc234 at #loc210))
657
+ #loc342 = loc(callsite(#loc235 at #loc210))
658
+ #loc343 = loc(callsite(#loc236 at #loc210))
659
+ #loc344 = loc(callsite(#loc237 at #loc210))
660
+ #loc345 = loc(callsite(#loc238 at #loc210))
661
+ #loc346 = loc(callsite(#loc239 at #loc210))
662
+ #loc347 = loc(callsite(#loc240 at #loc210))
663
+ #loc348 = loc(callsite(#loc241 at #loc210))
664
+ #loc349 = loc(callsite(#loc242 at #loc210))
665
+ #loc350 = loc(callsite(#loc243 at #loc210))
666
+ #loc351 = loc(callsite(#loc244 at #loc210))
667
+ #loc352 = loc(callsite(#loc245 at #loc210))
668
+ #loc353 = loc(callsite(#loc246 at #loc210))
669
+ #loc355 = loc(callsite(#loc248 at #loc210))
670
+ #loc356 = loc(callsite(#loc249 at #loc210))
671
+ #loc357 = loc(callsite(#loc250 at #loc210))
672
+ #loc358 = loc(callsite(#loc251 at #loc210))
673
+ #loc359 = loc(callsite(#loc252 at #loc210))
674
+ #loc360 = loc(callsite(#loc253 at #loc210))
675
+ #loc361 = loc(callsite(#loc254 at #loc210))
676
+ #loc362 = loc(callsite(#loc255 at #loc210))
677
+ #loc363 = loc(callsite(#loc256 at #loc210))
678
+ #loc365 = loc(callsite(#loc258 at #loc210))
679
+ #loc366 = loc(callsite(#loc259 at #loc210))
680
+ #loc367 = loc(callsite(#loc260 at #loc210))
681
+ #loc368 = loc(callsite(#loc261 at #loc210))
682
+ #loc369 = loc(callsite(#loc262 at #loc210))
683
+ #loc370 = loc(callsite(#loc263 at #loc210))
684
+ #loc371 = loc(callsite(#loc265 at #loc2))
685
+ #loc372 = loc(callsite(#loc282 at #loc2))
686
+ #loc373 = loc(callsite(#loc283 at #loc2))
687
+ #loc374 = loc(callsite(#loc209 at #loc293))
688
+ #loc375 = loc(callsite(#loc211 at #loc293))
689
+ #loc376 = loc(callsite(#loc212 at #loc293))
690
+ #loc377 = loc(callsite(#loc213 at #loc293))
691
+ #loc378 = loc(callsite(#loc214 at #loc293))
692
+ #loc379 = loc(callsite(#loc215 at #loc293))
693
+ #loc380 = loc(callsite(#loc218 at #loc293))
694
+ #loc381 = loc(callsite(#loc219 at #loc293))
695
+ #loc382 = loc(callsite(#loc246 at #loc293))
696
+ #loc384 = loc(callsite(#loc248 at #loc293))
697
+ #loc385 = loc(callsite(#loc249 at #loc293))
698
+ #loc386 = loc(callsite(#loc250 at #loc293))
699
+ #loc387 = loc(callsite(#loc251 at #loc293))
700
+ #loc388 = loc(callsite(#loc252 at #loc293))
701
+ #loc389 = loc(callsite(#loc253 at #loc293))
702
+ #loc390 = loc(callsite(#loc254 at #loc293))
703
+ #loc391 = loc(callsite(#loc255 at #loc293))
704
+ #loc392 = loc(callsite(#loc256 at #loc293))
705
+ #loc394 = loc(callsite(#loc258 at #loc293))
706
+ #loc395 = loc(callsite(#loc259 at #loc293))
707
+ #loc396 = loc(callsite(#loc260 at #loc293))
708
+ #loc397 = loc(callsite(#loc261 at #loc293))
709
+ #loc398 = loc(callsite(#loc262 at #loc293))
710
+ #loc399 = loc(callsite(#loc263 at #loc293))
711
+ #loc400 = loc(callsite(#loc265 at #loc137))
712
+ #loc401 = loc(callsite(#loc282 at #loc137))
713
+ #loc402 = loc(callsite(#loc283 at #loc137))
714
+ #loc403 = loc("m_i"(#loc316))
715
+ #loc404 = loc(callsite(#loc192 at #loc319))
716
+ #loc405 = loc(callsite(#loc193 at #loc319))
717
+ #loc406 = loc(callsite(#loc194 at #loc319))
718
+ #loc407 = loc(callsite(#loc196 at #loc319))
719
+ #loc408 = loc(callsite(#loc31 at #loc319))
720
+ #loc409 = loc(callsite(#loc3 at #loc319))
721
+ #loc410 = loc(callsite(#loc54 at #loc323))
722
+ #loc411 = loc(callsite(#loc54 at #loc324))
723
+ #loc412 = loc(callsite(#loc86 at #loc354))
724
+ #loc414 = loc(callsite(#loc98 at #loc364))
725
+ #loc416 = loc(callsite(#loc194 at #loc368))
726
+ #loc417 = loc(callsite(#loc196 at #loc368))
727
+ #loc418 = loc(callsite(#loc3 at #loc368))
728
+ #loc419 = loc(callsite(#loc264 at #loc371))
729
+ #loc420 = loc(callsite(#loc266 at #loc371))
730
+ #loc421 = loc(callsite(#loc267 at #loc371))
731
+ #loc422 = loc(callsite(#loc268 at #loc371))
732
+ #loc423 = loc(callsite(#loc269 at #loc371))
733
+ #loc424 = loc(callsite(#loc270 at #loc371))
734
+ #loc425 = loc(callsite(#loc271 at #loc371))
735
+ #loc426 = loc(callsite(#loc272 at #loc371))
736
+ #loc427 = loc(callsite(#loc273 at #loc371))
737
+ #loc428 = loc(callsite(#loc274 at #loc371))
738
+ #loc429 = loc(callsite(#loc275 at #loc371))
739
+ #loc430 = loc(callsite(#loc276 at #loc371))
740
+ #loc431 = loc(callsite(#loc277 at #loc371))
741
+ #loc432 = loc(callsite(#loc278 at #loc371))
742
+ #loc433 = loc(callsite(#loc279 at #loc371))
743
+ #loc434 = loc(callsite(#loc280 at #loc371))
744
+ #loc435 = loc(callsite(#loc281 at #loc371))
745
+ #loc436 = loc(callsite(#loc192 at #loc376))
746
+ #loc437 = loc(callsite(#loc193 at #loc376))
747
+ #loc438 = loc(callsite(#loc194 at #loc376))
748
+ #loc439 = loc(callsite(#loc196 at #loc376))
749
+ #loc440 = loc(callsite(#loc31 at #loc376))
750
+ #loc441 = loc(callsite(#loc3 at #loc376))
751
+ #loc442 = loc(callsite(#loc86 at #loc383))
752
+ #loc444 = loc(callsite(#loc98 at #loc393))
753
+ #loc446 = loc(callsite(#loc194 at #loc397))
754
+ #loc447 = loc(callsite(#loc196 at #loc397))
755
+ #loc448 = loc(callsite(#loc3 at #loc397))
756
+ #loc449 = loc(callsite(#loc264 at #loc400))
757
+ #loc450 = loc(callsite(#loc266 at #loc400))
758
+ #loc451 = loc(callsite(#loc267 at #loc400))
759
+ #loc452 = loc(callsite(#loc268 at #loc400))
760
+ #loc453 = loc(callsite(#loc269 at #loc400))
761
+ #loc454 = loc(callsite(#loc270 at #loc400))
762
+ #loc455 = loc(callsite(#loc271 at #loc400))
763
+ #loc456 = loc(callsite(#loc272 at #loc400))
764
+ #loc457 = loc(callsite(#loc273 at #loc400))
765
+ #loc458 = loc(callsite(#loc274 at #loc400))
766
+ #loc459 = loc(callsite(#loc275 at #loc400))
767
+ #loc460 = loc(callsite(#loc276 at #loc400))
768
+ #loc461 = loc(callsite(#loc277 at #loc400))
769
+ #loc462 = loc(callsite(#loc278 at #loc400))
770
+ #loc463 = loc(callsite(#loc279 at #loc400))
771
+ #loc464 = loc(callsite(#loc280 at #loc400))
772
+ #loc465 = loc(callsite(#loc281 at #loc400))
773
+ #loc466 = loc("offs_n"(#loc403))
774
+ #loc467 = loc(callsite(#loc88 at #loc412))
775
+ #loc468 = loc(callsite(#loc100 at #loc414))
776
+ #loc469 = loc(callsite(#loc88 at #loc442))
777
+ #loc470 = loc(callsite(#loc100 at #loc444))
778
+ #loc471 = loc("kv_offset"(#loc466))
779
+ #loc472 = loc(callsite(#loc471 at #loc2))
780
+ #loc473 = loc(callsite(#loc471 at #loc137))
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_poi_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source", "triton_poi_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir", "triton_poi_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir", "triton_poi_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir", "triton_poi_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx", "triton_poi_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin", "triton_poi_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json"}}
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin ADDED
Binary file (10.3 kB). View file
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d650530c018e98a61be4958bef98391bf5f4932885981bbdd4c94cc375f6e8e4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_1"}
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ ; Function Attrs: nounwind
6
+ define ptx_kernel void @triton_poi_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
7
+ %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
8
+ %8 = shl i32 %7, 7, !dbg !8
9
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
10
+ %10 = and i32 %9, 127, !dbg !9
11
+ %11 = or disjoint i32 %8, %10, !dbg !10
12
+ %12 = icmp slt i32 %11, %3, !dbg !11
13
+ %13 = sext i32 %11 to i64, !dbg !12
14
+ %.frozen = freeze i64 %2, !dbg !13
15
+ %14 = sdiv i64 %13, %.frozen, !dbg !13
16
+ %15 = mul i64 %14, %.frozen, !dbg !12
17
+ %.decomposed = sub i64 %13, %15, !dbg !12
18
+ %.not = icmp ne i64 %.decomposed, 0, !dbg !17
19
+ %16 = icmp slt i32 %8, 0, !dbg !18
20
+ %17 = icmp slt i64 %2, 0, !dbg !19
21
+ %18 = xor i1 %16, %17, !dbg !20
22
+ %narrow = select i1 %18, i1 %.not, i1 false, !dbg !21
23
+ %19 = sext i1 %narrow to i64, !dbg !21
24
+ %20 = add nsw i64 %14, %19, !dbg !21
25
+ %21 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !22
26
+ %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !23
27
+ %23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %21, i64 %22, i1 %12) #2, !dbg !23
28
+ %24 = bitcast i32 %23 to float, !dbg !23
29
+ %25 = fmul float %24, 0x3FE62E4300000000, !dbg !24
30
+ %26 = icmp slt i64 %2, 2, !dbg !25
31
+ %27 = icmp sgt i64 %2, 1, !dbg !26
32
+ %28 = select i1 %27, i64 %2, i64 0, !dbg !27
33
+ %29 = zext i1 %26 to i64, !dbg !28
34
+ %30 = add i64 %28, %29, !dbg !29
35
+ %31 = mul i64 %20, %30, !dbg !30
36
+ %32 = getelementptr float, ptr addrspace(1) %1, i64 %.decomposed, !dbg !31
37
+ %33 = getelementptr float, ptr addrspace(1) %32, i64 %31, !dbg !31
38
+ %34 = bitcast float %25 to i32, !dbg !32
39
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %34, ptr addrspace(1) %33, i1 %12) #2, !dbg !32
40
+ ret void, !dbg !33
41
+ }
42
+
43
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
44
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
45
+
46
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
47
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
48
+
49
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
50
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
51
+ attributes #2 = { nounwind }
52
+
53
+ !llvm.dbg.cu = !{!0}
54
+ !llvm.module.flags = !{!2, !3}
55
+
56
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
57
+ !1 = !DIFile(filename: "csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz")
58
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
59
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
60
+ !4 = distinct !DISubprogram(name: "triton_poi_fused_mul_1", linkageName: "triton_poi_fused_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
61
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
62
+ !6 = !{}
63
+ !7 = !DILocation(line: 19, column: 28, scope: !4)
64
+ !8 = !DILocation(line: 19, column: 33, scope: !4)
65
+ !9 = !DILocation(line: 20, column: 36, scope: !4)
66
+ !10 = !DILocation(line: 20, column: 23, scope: !4)
67
+ !11 = !DILocation(line: 21, column: 21, scope: !4)
68
+ !12 = !DILocation(line: 23, column: 19, scope: !4)
69
+ !13 = !DILocation(line: 72, column: 16, scope: !14, inlinedAt: !16)
70
+ !14 = distinct !DILexicalBlockFile(scope: !4, file: !15, discriminator: 0)
71
+ !15 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
72
+ !16 = !DILocation(line: 24, column: 51, scope: !4)
73
+ !17 = !DILocation(line: 74, column: 34, scope: !14, inlinedAt: !16)
74
+ !18 = !DILocation(line: 75, column: 25, scope: !14, inlinedAt: !16)
75
+ !19 = !DILocation(line: 75, column: 36, scope: !14, inlinedAt: !16)
76
+ !20 = !DILocation(line: 75, column: 32, scope: !14, inlinedAt: !16)
77
+ !21 = !DILocation(line: 75, column: 47, scope: !14, inlinedAt: !16)
78
+ !22 = !DILocation(line: 25, column: 30, scope: !4)
79
+ !23 = !DILocation(line: 25, column: 35, scope: !4)
80
+ !24 = !DILocation(line: 27, column: 18, scope: !4)
81
+ !25 = !DILocation(line: 28, column: 49, scope: !4)
82
+ !26 = !DILocation(line: 28, column: 75, scope: !4)
83
+ !27 = !DILocation(line: 28, column: 66, scope: !4)
84
+ !28 = !DILocation(line: 28, scope: !4)
85
+ !29 = !DILocation(line: 28, column: 57, scope: !4)
86
+ !30 = !DILocation(line: 28, column: 34, scope: !4)
87
+ !31 = !DILocation(line: 28, column: 25, scope: !4)
88
+ !32 = !DILocation(line: 28, column: 88, scope: !4)
89
+ !33 = !DILocation(line: 28, column: 4, scope: !4)
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_poi_fused_mul_1 // -- Begin function triton_poi_fused_mul_1
10
+ // @triton_poi_fused_mul_1
11
+ .visible .entry triton_poi_fused_mul_1(
12
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_0,
13
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_1,
14
+ .param .u64 triton_poi_fused_mul_1_param_2,
15
+ .param .u32 triton_poi_fused_mul_1_param_3,
16
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_4,
17
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_5
18
+ )
19
+ .reqntid 128
20
+ {
21
+ .reg .pred %p<11>;
22
+ .reg .b32 %r<13>;
23
+ .reg .b64 %rd<30>;
24
+ .loc 1 18 0 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:18:0
25
+ $L__func_begin0:
26
+ .loc 1 18 0 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:18:0
27
+
28
+ // %bb.0:
29
+ ld.param.b32 %r2, [triton_poi_fused_mul_1_param_3];
30
+ ld.param.b64 %rd7, [triton_poi_fused_mul_1_param_1];
31
+ ld.param.b64 %rd6, [triton_poi_fused_mul_1_param_0];
32
+ ld.param.b64 %rd8, [triton_poi_fused_mul_1_param_2];
33
+ $L__tmp0:
34
+ .loc 1 19 28 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:19:28
35
+ mov.u32 %r3, %ctaid.x;
36
+ .loc 1 19 33 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:19:33
37
+ shl.b32 %r1, %r3, 7;
38
+ .loc 1 20 36 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:20:36
39
+ mov.u32 %r4, %tid.x;
40
+ and.b32 %r5, %r4, 127;
41
+ .loc 1 20 23 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:20:23
42
+ or.b32 %r6, %r1, %r5;
43
+ .loc 1 23 19 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:23:19
44
+ cvt.s64.s32 %rd1, %r6;
45
+ $L__tmp1:
46
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
47
+ or.b64 %rd10, %rd1, %rd8;
48
+ and.b64 %rd11, %rd10, -4294967296;
49
+ setp.ne.b64 %p1, %rd11, 0;
50
+ @%p1 bra $L__BB0_2;
51
+ bra.uni $L__BB0_1;
52
+ $L__BB0_2:
53
+ div.s64 %rd29, %rd1, %rd8;
54
+ bra.uni $L__BB0_3;
55
+ $L__BB0_1:
56
+ cvt.u32.u64 %r7, %rd8;
57
+ cvt.u32.u64 %r8, %rd1;
58
+ div.u32 %r9, %r8, %r7;
59
+ cvt.u64.u32 %rd29, %r9;
60
+ $L__tmp2:
61
+ $L__BB0_3:
62
+ .loc 2 0 16 // triton_helpers.py:0:16
63
+ cvt.u32.u64 %r12, %rd1;
64
+ .loc 1 21 21 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:21:21
65
+ setp.lt.s32 %p2, %r12, %r2;
66
+ .loc 1 23 19 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:23:19
67
+ mul.lo.s64 %rd17, %rd29, %rd8;
68
+ sub.s64 %rd18, %rd1, %rd17;
69
+ $L__tmp3:
70
+ .loc 2 74 34 // triton_helpers.py:74:34 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
71
+ setp.ne.b64 %p4, %rd18, 0;
72
+ .loc 2 75 25 // triton_helpers.py:75:25 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
73
+ setp.lt.s32 %p5, %r1, 0;
74
+ .loc 2 75 36 // triton_helpers.py:75:36 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
75
+ setp.lt.s64 %p6, %rd8, 0;
76
+ .loc 2 75 32 // triton_helpers.py:75:32 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
77
+ xor.pred %p7, %p5, %p6;
78
+ .loc 2 75 47 // triton_helpers.py:75:47 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
79
+ and.pred %p8, %p7, %p4;
80
+ selp.b64 %rd19, -1, 0, %p8;
81
+ add.s64 %rd20, %rd29, %rd19;
82
+ $L__tmp4:
83
+ .loc 1 25 30 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:25:30
84
+ shl.b64 %rd21, %rd1, 2;
85
+ add.s64 %rd13, %rd6, %rd21;
86
+ .loc 1 25 35 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:25:35
87
+ // begin inline asm
88
+ mov.u64 %rd14, 0x0;
89
+ createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
90
+ // end inline asm
91
+ // begin inline asm
92
+ mov.u32 %r10, 0x0;
93
+ @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10 }, [ %rd13 + 0 ], %rd14;
94
+ // end inline asm
95
+ .loc 1 27 18 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:27:18
96
+ mul.f32 %r11, %r10, 0f3F317218;
97
+ .loc 1 28 49 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:49
98
+ setp.lt.s64 %p9, %rd8, 2;
99
+ .loc 1 28 75 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:75
100
+ setp.gt.s64 %p10, %rd8, 1;
101
+ .loc 1 28 66 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:66
102
+ selp.b64 %rd22, %rd8, 0, %p10;
103
+ .loc 1 28 0 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28
104
+ selp.b64 %rd23, 1, 0, %p9;
105
+ .loc 1 28 57 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:57
106
+ add.s64 %rd24, %rd22, %rd23;
107
+ .loc 1 28 34 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:34
108
+ mul.lo.s64 %rd25, %rd20, %rd24;
109
+ .loc 1 28 25 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:25
110
+ shl.b64 %rd26, %rd18, 2;
111
+ add.s64 %rd27, %rd7, %rd26;
112
+ shl.b64 %rd28, %rd25, 2;
113
+ add.s64 %rd15, %rd27, %rd28;
114
+ .loc 1 28 88 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:88
115
+ // begin inline asm
116
+ @%p2 st.global.b32 [ %rd15 + 0 ], { %r11 };
117
+ // end inline asm
118
+ .loc 1 28 4 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:4
119
+ ret;
120
+ $L__tmp5:
121
+ $L__func_end0:
122
+ // -- End function
123
+ }
124
+ .file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py"
125
+ .file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
126
+ .section .debug_abbrev
127
+ {
128
+ .b8 1 // Abbreviation Code
129
+ .b8 17 // DW_TAG_compile_unit
130
+ .b8 1 // DW_CHILDREN_yes
131
+ .b8 37 // DW_AT_producer
132
+ .b8 8 // DW_FORM_string
133
+ .b8 19 // DW_AT_language
134
+ .b8 5 // DW_FORM_data2
135
+ .b8 3 // DW_AT_name
136
+ .b8 8 // DW_FORM_string
137
+ .b8 16 // DW_AT_stmt_list
138
+ .b8 6 // DW_FORM_data4
139
+ .b8 27 // DW_AT_comp_dir
140
+ .b8 8 // DW_FORM_string
141
+ .b8 0 // EOM(1)
142
+ .b8 0 // EOM(2)
143
+ .b8 2 // Abbreviation Code
144
+ .b8 46 // DW_TAG_subprogram
145
+ .b8 0 // DW_CHILDREN_no
146
+ .b8 3 // DW_AT_name
147
+ .b8 8 // DW_FORM_string
148
+ .b8 32 // DW_AT_inline
149
+ .b8 11 // DW_FORM_data1
150
+ .b8 0 // EOM(1)
151
+ .b8 0 // EOM(2)
152
+ .b8 3 // Abbreviation Code
153
+ .b8 46 // DW_TAG_subprogram
154
+ .b8 1 // DW_CHILDREN_yes
155
+ .b8 17 // DW_AT_low_pc
156
+ .b8 1 // DW_FORM_addr
157
+ .b8 18 // DW_AT_high_pc
158
+ .b8 1 // DW_FORM_addr
159
+ .b8 49 // DW_AT_abstract_origin
160
+ .b8 19 // DW_FORM_ref4
161
+ .b8 0 // EOM(1)
162
+ .b8 0 // EOM(2)
163
+ .b8 4 // Abbreviation Code
164
+ .b8 29 // DW_TAG_inlined_subroutine
165
+ .b8 0 // DW_CHILDREN_no
166
+ .b8 49 // DW_AT_abstract_origin
167
+ .b8 19 // DW_FORM_ref4
168
+ .b8 17 // DW_AT_low_pc
169
+ .b8 1 // DW_FORM_addr
170
+ .b8 18 // DW_AT_high_pc
171
+ .b8 1 // DW_FORM_addr
172
+ .b8 88 // DW_AT_call_file
173
+ .b8 11 // DW_FORM_data1
174
+ .b8 89 // DW_AT_call_line
175
+ .b8 11 // DW_FORM_data1
176
+ .b8 87 // DW_AT_call_column
177
+ .b8 11 // DW_FORM_data1
178
+ .b8 0 // EOM(1)
179
+ .b8 0 // EOM(2)
180
+ .b8 0 // EOM(3)
181
+ }
182
+ .section .debug_info
183
+ {
184
+ .b32 211 // Length of Unit
185
+ .b8 2 // DWARF version number
186
+ .b8 0
187
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
188
+ .b8 8 // Address Size (in bytes)
189
+ .b8 1 // Abbrev [1] 0xb:0xcc DW_TAG_compile_unit
190
+ .b8 116 // DW_AT_producer
191
+ .b8 114
192
+ .b8 105
193
+ .b8 116
194
+ .b8 111
195
+ .b8 110
196
+ .b8 0
197
+ .b8 2 // DW_AT_language
198
+ .b8 0
199
+ .b8 99 // DW_AT_name
200
+ .b8 115
201
+ .b8 122
202
+ .b8 52
203
+ .b8 111
204
+ .b8 121
205
+ .b8 113
206
+ .b8 121
207
+ .b8 97
208
+ .b8 52
209
+ .b8 116
210
+ .b8 114
211
+ .b8 99
212
+ .b8 53
213
+ .b8 108
214
+ .b8 108
215
+ .b8 53
216
+ .b8 53
217
+ .b8 122
218
+ .b8 106
219
+ .b8 103
220
+ .b8 101
221
+ .b8 50
222
+ .b8 106
223
+ .b8 112
224
+ .b8 98
225
+ .b8 102
226
+ .b8 110
227
+ .b8 112
228
+ .b8 110
229
+ .b8 51
230
+ .b8 52
231
+ .b8 99
232
+ .b8 105
233
+ .b8 52
234
+ .b8 54
235
+ .b8 50
236
+ .b8 107
237
+ .b8 110
238
+ .b8 103
239
+ .b8 108
240
+ .b8 53
241
+ .b8 53
242
+ .b8 98
243
+ .b8 105
244
+ .b8 54
245
+ .b8 54
246
+ .b8 106
247
+ .b8 111
248
+ .b8 107
249
+ .b8 99
250
+ .b8 120
251
+ .b8 46
252
+ .b8 112
253
+ .b8 121
254
+ .b8 0
255
+ .b32 .debug_line // DW_AT_stmt_list
256
+ .b8 47 // DW_AT_comp_dir
257
+ .b8 119
258
+ .b8 111
259
+ .b8 114
260
+ .b8 107
261
+ .b8 115
262
+ .b8 112
263
+ .b8 97
264
+ .b8 99
265
+ .b8 101
266
+ .b8 47
267
+ .b8 104
268
+ .b8 97
269
+ .b8 110
270
+ .b8 114
271
+ .b8 117
272
+ .b8 105
273
+ .b8 47
274
+ .b8 106
275
+ .b8 117
276
+ .b8 110
277
+ .b8 113
278
+ .b8 117
279
+ .b8 97
280
+ .b8 110
281
+ .b8 47
282
+ .b8 83
283
+ .b8 112
284
+ .b8 101
285
+ .b8 99
286
+ .b8 70
287
+ .b8 111
288
+ .b8 114
289
+ .b8 103
290
+ .b8 101
291
+ .b8 47
292
+ .b8 99
293
+ .b8 97
294
+ .b8 99
295
+ .b8 104
296
+ .b8 101
297
+ .b8 47
298
+ .b8 99
299
+ .b8 111
300
+ .b8 109
301
+ .b8 112
302
+ .b8 105
303
+ .b8 108
304
+ .b8 101
305
+ .b8 100
306
+ .b8 95
307
+ .b8 107
308
+ .b8 101
309
+ .b8 114
310
+ .b8 110
311
+ .b8 101
312
+ .b8 108
313
+ .b8 115
314
+ .b8 47
315
+ .b8 115
316
+ .b8 122
317
+ .b8 0
318
+ .b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
319
+ .b8 116 // DW_AT_name
320
+ .b8 114
321
+ .b8 105
322
+ .b8 116
323
+ .b8 111
324
+ .b8 110
325
+ .b8 95
326
+ .b8 112
327
+ .b8 111
328
+ .b8 105
329
+ .b8 95
330
+ .b8 102
331
+ .b8 117
332
+ .b8 115
333
+ .b8 101
334
+ .b8 100
335
+ .b8 95
336
+ .b8 109
337
+ .b8 117
338
+ .b8 108
339
+ .b8 95
340
+ .b8 49
341
+ .b8 0
342
+ .b8 1 // DW_AT_inline
343
+ .b8 3 // Abbrev [3] 0xa8:0x2e DW_TAG_subprogram
344
+ .b64 $L__func_begin0 // DW_AT_low_pc
345
+ .b64 $L__func_end0 // DW_AT_high_pc
346
+ .b32 143 // DW_AT_abstract_origin
347
+ .b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
348
+ .b32 143 // DW_AT_abstract_origin
349
+ .b64 $L__tmp1 // DW_AT_low_pc
350
+ .b64 $L__tmp4 // DW_AT_high_pc
351
+ .b8 1 // DW_AT_call_file
352
+ .b8 24 // DW_AT_call_line
353
+ .b8 51 // DW_AT_call_column
354
+ .b8 0 // End Of Children Mark
355
+ .b8 0 // End Of Children Mark
356
+ }
357
+ .section .debug_macinfo { }
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":18:0)
2
+ #loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
3
+ #loc34 = loc("in_ptr0"(#loc))
4
+ #loc35 = loc("out_ptr0"(#loc))
5
+ #loc36 = loc("ks0"(#loc))
6
+ #loc37 = loc("xnumel"(#loc))
7
+ #loc49 = loc("a"(#loc22))
8
+ #loc50 = loc("b"(#loc22))
9
+ module {
10
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
11
+ %xoffset = tt.get_program_id x : i32 loc(#loc38)
12
+ %xoffset_0 = arith.constant 128 : i32 loc(#loc39)
13
+ %xoffset_1 = arith.constant 128 : i32 loc(#loc39)
14
+ %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc39)
15
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
16
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
17
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
18
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
19
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
20
+ %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
21
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
22
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
23
+ %x1 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%xindex_4, %ks0) : (tensor<128xi32>, i64) -> tensor<128xi64> loc(#loc44)
24
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc45)
25
+ %tmp0_8 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc45)
26
+ %tmp0_9 = tt.load %tmp0_8, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc46)
27
+ %tmp1 = arith.constant 0.693147182 : f32 loc(#loc47)
28
+ %tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc48)
29
+ %tmp2_10 = arith.mulf %tmp0_9, %tmp2 : tensor<128xf32> loc(#loc48)
30
+ %c1_i32 = arith.constant 1 : i32 loc(#loc12)
31
+ %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc12)
32
+ %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc12)
33
+ %c1_i32_11 = arith.constant 1 : i32 loc(#loc13)
34
+ %c1_i32_12 = arith.constant 1 : i32 loc(#loc13)
35
+ %2 = arith.extui %1 : i1 to i32 loc(#loc13)
36
+ %3 = arith.muli %c1_i32_12, %2 : i32 loc(#loc13)
37
+ %c1_i32_13 = arith.constant 1 : i32 loc(#loc14)
38
+ %4 = arith.extsi %c1_i32_13 : i32 to i64 loc(#loc14)
39
+ %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc14)
40
+ %6 = arith.extui %5 : i1 to i64 loc(#loc15)
41
+ %7 = arith.muli %ks0, %6 : i64 loc(#loc15)
42
+ %8 = arith.extsi %3 : i32 to i64 loc(#loc16)
43
+ %9 = arith.addi %8, %7 : i64 loc(#loc16)
44
+ %10 = tt.splat %9 : i64 -> tensor<128xi64> loc(#loc17)
45
+ %11 = arith.muli %x1, %10 : tensor<128xi64> loc(#loc17)
46
+ %12 = arith.addi %x0_7, %11 : tensor<128xi64> loc(#loc18)
47
+ %13 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc19)
48
+ %14 = tt.addptr %13, %12 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc19)
49
+ tt.store %14, %tmp2_10, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc20)
50
+ tt.return loc(#loc21)
51
+ } loc(#loc)
52
+ tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%a: tensor<128xi32> loc("a"(#loc22)), %b: i64 loc("b"(#loc22))) -> tensor<128xi64> attributes {noinline = false} {
53
+ %quot = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc51)
54
+ %quot_0 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc51)
55
+ %quot_1 = arith.divsi %quot, %quot_0 : tensor<128xi64> loc(#loc51)
56
+ %remainder = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc52)
57
+ %remainder_2 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc52)
58
+ %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<128xi64> loc(#loc52)
59
+ %fixed = arith.constant 0 : i32 loc(#loc53)
60
+ %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc53)
61
+ %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<128xi64> loc(#loc53)
62
+ %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<128xi64> loc(#loc53)
63
+ %fixed_7 = arith.constant 1 : i32 loc(#loc54)
64
+ %fixed_8 = arith.constant 1 : i64 loc(#loc54)
65
+ %fixed_9 = arith.constant dense<1> : tensor<128xi64> loc(#loc54)
66
+ %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<128xi64> loc(#loc54)
67
+ %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc55)
68
+ %c0_i32 = arith.constant 0 : i32 loc(#loc28)
69
+ %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc28)
70
+ %0 = arith.cmpi slt, %a, %cst : tensor<128xi32> loc(#loc28)
71
+ %c0_i32_12 = arith.constant 0 : i32 loc(#loc29)
72
+ %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc29)
73
+ %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc29)
74
+ %3 = tt.splat %2 : i1 -> tensor<128xi1> loc(#loc30)
75
+ %4 = arith.cmpi ne, %0, %3 : tensor<128xi1> loc(#loc30)
76
+ %5 = arith.select %4, %fixed_11, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc31)
77
+ tt.return %5 : tensor<128xi64> loc(#loc32)
78
+ ^bb1: // no predecessors
79
+ %6 = ub.poison : tensor<128xi64> loc(#loc33)
80
+ tt.return %6 : tensor<128xi64> loc(#loc33)
81
+ } loc(#loc22)
82
+ } loc(#loc)
83
+ #loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:28)
84
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:33)
85
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:36)
86
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:23)
87
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":21:21)
88
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":23:19)
89
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":24:51)
90
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:30)
91
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:35)
92
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":26:11)
93
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":27:18)
94
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:49)
95
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:41)
96
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:75)
97
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:66)
98
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:57)
99
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:34)
100
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:30)
101
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:25)
102
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:88)
103
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:4)
104
+ #loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
105
+ #loc24 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
106
+ #loc25 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
107
+ #loc26 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
108
+ #loc27 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
109
+ #loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
110
+ #loc29 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
111
+ #loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
112
+ #loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
113
+ #loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
114
+ #loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
115
+ #loc38 = loc("xoffset"(#loc1))
116
+ #loc39 = loc("xoffset"(#loc2))
117
+ #loc40 = loc("xindex"(#loc3))
118
+ #loc41 = loc("xindex"(#loc4))
119
+ #loc42 = loc("xmask"(#loc5))
120
+ #loc43 = loc("x0"(#loc6))
121
+ #loc44 = loc("x1"(#loc7))
122
+ #loc45 = loc("tmp0"(#loc8))
123
+ #loc46 = loc("tmp0"(#loc9))
124
+ #loc47 = loc("tmp1"(#loc10))
125
+ #loc48 = loc("tmp2"(#loc11))
126
+ #loc51 = loc("quot"(#loc23))
127
+ #loc52 = loc("remainder"(#loc24))
128
+ #loc53 = loc("fixed"(#loc25))
129
+ #loc54 = loc("fixed"(#loc26))
130
+ #loc55 = loc("fixed"(#loc27))
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
2
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":18:0)
3
+ #loc30 = loc("in_ptr0"(#loc))
4
+ #loc31 = loc("out_ptr0"(#loc))
5
+ #loc32 = loc("ks0"(#loc))
6
+ #loc33 = loc("xnumel"(#loc))
7
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
8
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
9
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
10
+ %cst = arith.constant dense<0.693147182> : tensor<128xf32, #blocked> loc(#loc1)
11
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
12
+ %c0_i64 = arith.constant 0 : i64 loc(#loc1)
13
+ %cst_0 = arith.constant dense<0> : tensor<128xi64, #blocked> loc(#loc1)
14
+ %cst_1 = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1)
15
+ %cst_2 = arith.constant dense<1> : tensor<128xi64, #blocked> loc(#loc1)
16
+ %xoffset = tt.get_program_id x : i32 loc(#loc34)
17
+ %xoffset_3 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc35)
18
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc36)
19
+ %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<128xi32, #blocked> loc(#loc37)
20
+ %xindex_5 = arith.addi %xindex_4, %xindex : tensor<128xi32, #blocked> loc(#loc37)
21
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc38)
22
+ %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128xi32, #blocked> loc(#loc38)
23
+ %x0 = arith.extsi %xindex_5 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc39)
24
+ %x0_7 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc39)
25
+ %x0_8 = arith.remsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc39)
26
+ %quot = arith.divsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc49)
27
+ %fixed = arith.cmpi ne, %x0_8, %cst_0 : tensor<128xi64, #blocked> loc(#loc50)
28
+ %fixed_9 = arith.subi %quot, %cst_2 : tensor<128xi64, #blocked> loc(#loc51)
29
+ %fixed_10 = arith.select %fixed, %fixed_9, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc52)
30
+ %x1 = arith.cmpi slt, %xindex_5, %cst_1 : tensor<128xi32, #blocked> loc(#loc53)
31
+ %x1_11 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc54)
32
+ %x1_12 = tt.splat %x1_11 : i1 -> tensor<128xi1, #blocked> loc(#loc55)
33
+ %x1_13 = arith.cmpi ne, %x1, %x1_12 : tensor<128xi1, #blocked> loc(#loc55)
34
+ %x1_14 = arith.select %x1_13, %fixed_10, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc56)
35
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc45)
36
+ %tmp0_15 = tt.addptr %tmp0, %xindex_5 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi32, #blocked> loc(#loc45)
37
+ %tmp0_16 = tt.load %tmp0_15, %xmask_6 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc46)
38
+ %tmp2 = arith.mulf %tmp0_16, %cst : tensor<128xf32, #blocked> loc(#loc47)
39
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
40
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
41
+ %2 = arith.extui %1 : i1 to i64 loc(#loc22)
42
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
43
+ %4 = arith.extui %0 : i1 to i64 loc(#loc48)
44
+ %5 = arith.addi %4, %3 : i64 loc(#loc23)
45
+ %6 = tt.splat %5 : i64 -> tensor<128xi64, #blocked> loc(#loc25)
46
+ %7 = arith.muli %x1_14, %6 : tensor<128xi64, #blocked> loc(#loc25)
47
+ %8 = arith.addi %x0_8, %7 : tensor<128xi64, #blocked> loc(#loc26)
48
+ %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc27)
49
+ %10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi64, #blocked> loc(#loc27)
50
+ tt.store %10, %tmp2, %xmask_6 : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc28)
51
+ tt.return loc(#loc29)
52
+ } loc(#loc)
53
+ } loc(#loc)
54
+ #loc1 = loc(unknown)
55
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:28)
56
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:33)
57
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:36)
58
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:23)
59
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":21:21)
60
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":23:19)
61
+ #loc8 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
62
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":24:51)
63
+ #loc10 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
64
+ #loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
65
+ #loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
66
+ #loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
67
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
68
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
69
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
70
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:30)
71
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:35)
72
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":27:18)
73
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:49)
74
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:75)
75
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:66)
76
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:57)
77
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:41)
78
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:34)
79
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:30)
80
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:25)
81
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:88)
82
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:4)
83
+ #loc34 = loc("xoffset"(#loc2))
84
+ #loc35 = loc("xoffset"(#loc3))
85
+ #loc36 = loc("xindex"(#loc4))
86
+ #loc37 = loc("xindex"(#loc5))
87
+ #loc38 = loc("xmask"(#loc6))
88
+ #loc39 = loc("x0"(#loc7))
89
+ #loc40 = loc("quot"(#loc8))
90
+ #loc41 = loc("x1"(#loc9))
91
+ #loc42 = loc("fixed"(#loc10))
92
+ #loc43 = loc("fixed"(#loc11))
93
+ #loc44 = loc("fixed"(#loc12))
94
+ #loc45 = loc("tmp0"(#loc17))
95
+ #loc46 = loc("tmp0"(#loc18))
96
+ #loc47 = loc("tmp2"(#loc19))
97
+ #loc48 = loc(fused[#loc23, #loc24])
98
+ #loc49 = loc(callsite(#loc40 at #loc41))
99
+ #loc50 = loc(callsite(#loc42 at #loc41))
100
+ #loc51 = loc(callsite(#loc43 at #loc41))
101
+ #loc52 = loc(callsite(#loc44 at #loc41))
102
+ #loc53 = loc(callsite(#loc13 at #loc41))
103
+ #loc54 = loc(callsite(#loc14 at #loc41))
104
+ #loc55 = loc(callsite(#loc15 at #loc41))
105
+ #loc56 = loc(callsite(#loc16 at #loc41))
progress/github/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":18:0)
2
+ #loc30 = loc("in_ptr0"(#loc))
3
+ #loc31 = loc("out_ptr0"(#loc))
4
+ #loc32 = loc("ks0"(#loc))
5
+ #loc33 = loc("xnumel"(#loc))
6
+ module {
7
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
8
+ %fixed = arith.constant dense<1> : tensor<128xi64> loc(#loc49)
9
+ %x1 = arith.constant dense<0> : tensor<128xi32> loc(#loc50)
10
+ %fixed_0 = arith.constant dense<0> : tensor<128xi64> loc(#loc51)
11
+ %x1_1 = arith.constant 0 : i64 loc(#loc52)
12
+ %c1_i64 = arith.constant 1 : i64 loc(#loc6)
13
+ %tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc37)
14
+ %c128_i32 = arith.constant 128 : i32 loc(#loc6)
15
+ %xoffset = tt.get_program_id x : i32 loc(#loc38)
16
+ %xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc39)
17
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
18
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
19
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
20
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
21
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
22
+ %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
23
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
24
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
25
+ %quot = arith.divsi %x0, %x0_6 : tensor<128xi64> loc(#loc53)
26
+ %fixed_8 = arith.cmpi ne, %x0_7, %fixed_0 : tensor<128xi64> loc(#loc51)
27
+ %fixed_9 = arith.subi %quot, %fixed : tensor<128xi64> loc(#loc49)
28
+ %fixed_10 = arith.select %fixed_8, %fixed_9, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc54)
29
+ %x1_11 = arith.cmpi slt, %xindex_4, %x1 : tensor<128xi32> loc(#loc50)
30
+ %x1_12 = arith.cmpi slt, %ks0, %x1_1 : i64 loc(#loc52)
31
+ %x1_13 = tt.splat %x1_12 : i1 -> tensor<128xi1> loc(#loc55)
32
+ %x1_14 = arith.cmpi ne, %x1_11, %x1_13 : tensor<128xi1> loc(#loc55)
33
+ %x1_15 = arith.select %x1_14, %fixed_10, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc56)
34
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc46)
35
+ %tmp0_16 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc46)
36
+ %tmp0_17 = tt.load %tmp0_16, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc47)
37
+ %tmp2_18 = arith.mulf %tmp0_17, %tmp2 : tensor<128xf32> loc(#loc37)
38
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
39
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
40
+ %2 = arith.extui %1 : i1 to i64 loc(#loc22)
41
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
42
+ %4 = arith.extui %0 : i1 to i64 loc(#loc48)
43
+ %5 = arith.addi %4, %3 : i64 loc(#loc23)
44
+ %6 = tt.splat %5 : i64 -> tensor<128xi64> loc(#loc25)
45
+ %7 = arith.muli %x1_15, %6 : tensor<128xi64> loc(#loc25)
46
+ %8 = arith.addi %x0_7, %7 : tensor<128xi64> loc(#loc26)
47
+ %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc27)
48
+ %10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc27)
49
+ tt.store %10, %tmp2_18, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc28)
50
+ tt.return loc(#loc29)
51
+ } loc(#loc)
52
+ } loc(#loc)
53
+ #loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
54
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":24:51)
55
+ #loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
56
+ #loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
57
+ #loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
58
+ #loc6 = loc(unknown)
59
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":27:18)
60
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:28)
61
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:33)
62
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:36)
63
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:23)
64
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":21:21)
65
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":23:19)
66
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
67
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
68
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
69
+ #loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
70
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:30)
71
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:35)
72
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:49)
73
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:75)
74
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:66)
75
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:57)
76
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:41)
77
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:34)
78
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:30)
79
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:25)
80
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:88)
81
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:4)
82
+ #loc34 = loc("fixed"(#loc1))
83
+ #loc35 = loc("x1"(#loc2))
84
+ #loc36 = loc("fixed"(#loc4))
85
+ #loc37 = loc("tmp2"(#loc7))
86
+ #loc38 = loc("xoffset"(#loc8))
87
+ #loc39 = loc("xoffset"(#loc9))
88
+ #loc40 = loc("xindex"(#loc10))
89
+ #loc41 = loc("xindex"(#loc11))
90
+ #loc42 = loc("xmask"(#loc12))
91
+ #loc43 = loc("x0"(#loc13))
92
+ #loc44 = loc("quot"(#loc14))
93
+ #loc45 = loc("fixed"(#loc15))
94
+ #loc46 = loc("tmp0"(#loc18))
95
+ #loc47 = loc("tmp0"(#loc19))
96
+ #loc48 = loc(fused[#loc23, #loc24])
97
+ #loc49 = loc(callsite(#loc34 at #loc35))
98
+ #loc50 = loc(callsite(#loc3 at #loc35))
99
+ #loc51 = loc(callsite(#loc36 at #loc35))
100
+ #loc52 = loc(callsite(#loc5 at #loc35))
101
+ #loc53 = loc(callsite(#loc44 at #loc35))
102
+ #loc54 = loc(callsite(#loc45 at #loc35))
103
+ #loc55 = loc(callsite(#loc16 at #loc35))
104
+ #loc56 = loc(callsite(#loc17 at #loc35))
progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json"}}
progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "ef65677dccb0fd1ca33e2efd85dd27b554735f6893116cae461084f5b56323fe", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 196608, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}
progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir ADDED
@@ -0,0 +1,896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":200:41)
4
+ #loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":568:16)
5
+ #loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":449:51)
6
+ #loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":462:34)
7
+ #loc167 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":235:45)
8
+ #loc193 = loc("arg_Q"(#loc))
9
+ #loc194 = loc("arg_K"(#loc))
10
+ #loc195 = loc("arg_V"(#loc))
11
+ #loc196 = loc("arg_M"(#loc))
12
+ #loc197 = loc("arg_L"(#loc))
13
+ #loc198 = loc("arg_KV_NUM_BLKS"(#loc))
14
+ #loc199 = loc("arg_KV_IDX"(#loc))
15
+ #loc200 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
16
+ #loc201 = loc("arg_FULL_KV_IDX"(#loc))
17
+ #loc202 = loc("out_ptr0"(#loc))
18
+ #loc203 = loc("ks0"(#loc))
19
+ #loc204 = loc("ks1"(#loc))
20
+ #loc255 = loc(callsite(#loc66 at #loc2))
21
+ #loc296 = loc("m_ij"(#loc111))
22
+ #loc306 = loc("l_i"(#loc123))
23
+ #loc346 = loc(callsite(#loc66 at #loc167))
24
+ #loc406 = loc(callsite(#loc296 at #loc255))
25
+ #loc416 = loc(callsite(#loc306 at #loc255))
26
+ #loc435 = loc(callsite(#loc296 at #loc346))
27
+ #loc445 = loc(callsite(#loc306 at #loc346))
28
+ #loc465 = loc(callsite(#loc1 at #loc406))
29
+ #loc467 = loc(callsite(#loc1 at #loc416))
30
+ #loc495 = loc(callsite(#loc1 at #loc435))
31
+ #loc497 = loc(callsite(#loc1 at #loc445))
32
+ module {
33
+ tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_M: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_M"(#loc)), %arg_L: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_L"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} {
34
+ %cst = arith.constant dense<0> : tensor<1x128xi64> loc(#loc1)
35
+ %cst_0 = arith.constant dense<1024> : tensor<64x1xi32> loc(#loc1)
36
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1)
37
+ %cst_2 = arith.constant dense<16> : tensor<1x64xi32> loc(#loc205)
38
+ %cst_3 = arith.constant dense<16> : tensor<512x1xi32> loc(#loc205)
39
+ %cst_4 = arith.constant dense<0xFF800000> : tensor<512xf32> loc(#loc1)
40
+ %cst_5 = arith.constant dense<1.44269502> : tensor<512x64xf32> loc(#loc1)
41
+ %cst_6 = arith.constant dense<false> : tensor<512x64xi1> loc(#loc205)
42
+ %cst_7 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc205)
43
+ %cst_8 = arith.constant dense<1> : tensor<512x1xi32> loc(#loc205)
44
+ %cst_9 = arith.constant dense<0> : tensor<512x1xi32> loc(#loc205)
45
+ %cst_10 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc205)
46
+ %cst_11 = arith.constant dense<0xFF800000> : tensor<512x64xf32> loc(#loc1)
47
+ %cst_12 = arith.constant dense<0.0883883461> : tensor<512x64xf32> loc(#loc1)
48
+ %cst_13 = arith.constant dense<0.000000e+00> : tensor<512x64xf32> loc(#loc1)
49
+ %acc = arith.constant dense<0.000000e+00> : tensor<512x128xf32> loc(#loc360)
50
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1)
51
+ %c63_i32 = arith.constant 63 : i32 loc(#loc1)
52
+ %c31_i32 = arith.constant 31 : i32 loc(#loc1)
53
+ %cst_15 = arith.constant dense<128> : tensor<1x128x1xi32> loc(#loc1)
54
+ %mask = arith.constant dense<128> : tensor<1x1x128xi32> loc(#loc207)
55
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
56
+ %c2_i32 = arith.constant 2 : i32 loc(#loc1)
57
+ %q_range = arith.constant dense<4096> : tensor<1x128x1xi32> loc(#loc208)
58
+ %cst_16 = arith.constant dense<128> : tensor<4x1x1xi32> loc(#loc1)
59
+ %true = arith.constant true loc(#loc7)
60
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
61
+ %c4_i32 = arith.constant 4 : i32 loc(#loc1)
62
+ %HKV = arith.constant 8 : i32 loc(#loc209)
63
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
64
+ %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
65
+ %c1_i32 = arith.constant 1 : i32 loc(#loc1)
66
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
67
+ %c512_i32 = arith.constant 512 : i32 loc(#loc9)
68
+ %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
69
+ %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10)
70
+ %1 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc11)
71
+ %2 = arith.muli %ks0, %c32_i32 : i32 loc(#loc12)
72
+ %TILE_KV_OG = arith.addi %ks1, %c31_i32 : i32 loc(#loc361)
73
+ %TILE_KV_OG_17 = arith.divsi %TILE_KV_OG, %c32_i32 : i32 loc(#loc362)
74
+ %TILE_KV = arith.addi %TILE_KV_OG_17, %c63_i32 : i32 loc(#loc363)
75
+ %TILE_KV_18 = arith.divsi %TILE_KV, %c64_i32 : i32 loc(#loc364)
76
+ %TILE_KV_19 = arith.muli %TILE_KV_18, %c64_i32 : i32 loc(#loc212)
77
+ %3 = arith.divsi %TILE_KV_19, %c64_i32 : i32 loc(#loc18)
78
+ %off_z = tt.get_program_id x : i32 loc(#loc213)
79
+ %off_z_20 = arith.divsi %off_z, %HKV : i32 loc(#loc214)
80
+ %off_hkv = arith.remsi %off_z, %HKV : i32 loc(#loc215)
81
+ %off_t = tt.get_program_id y : i32 loc(#loc216)
82
+ %q_offset = arith.muli %off_z_20, %0 : i32 loc(#loc217)
83
+ %q_offset_21 = arith.muli %off_hkv, %c512_i32 : i32 loc(#loc218)
84
+ %q_offset_22 = arith.addi %q_offset, %q_offset_21 : i32 loc(#loc219)
85
+ %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc220)
86
+ %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc221)
87
+ %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc222)
88
+ tt.assert %true, "" : i1 loc(#loc7)
89
+ %off_g = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc223)
90
+ %off_m = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc224)
91
+ %offs_m = tt.expand_dims %off_m {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc225)
92
+ %offs_m_23 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc226)
93
+ %offs_m_24 = tt.reshape %offs_m_23 : tensor<4x128xi32> -> tensor<512xi32> loc(#loc365)
94
+ %block_n_start = arith.muli %off_t, %3 : i32 loc(#loc228)
95
+ %block_n_end = arith.addi %block_n_start, %3 : i32 loc(#loc229)
96
+ %q_range_25 = tt.expand_dims %off_g {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc230)
97
+ %q_range_26 = tt.expand_dims %q_range_25 {axis = 2 : i32} : tensor<4x1xi32> -> tensor<4x1x1xi32> loc(#loc230)
98
+ %q_range_27 = arith.muli %q_range_26, %cst_16 : tensor<4x1x1xi32> loc(#loc231)
99
+ %q_range_28 = tt.expand_dims %offs_m {axis = 2 : i32} : tensor<1x128xi32> -> tensor<1x128x1xi32> loc(#loc232)
100
+ %q_range_29 = arith.muli %q_range_28, %q_range : tensor<1x128x1xi32> loc(#loc208)
101
+ %q_range_30 = tt.broadcast %q_range_27 : tensor<4x1x1xi32> -> tensor<4x128x1xi32> loc(#loc233)
102
+ %q_range_31 = tt.broadcast %q_range_29 : tensor<1x128x1xi32> -> tensor<4x128x1xi32> loc(#loc233)
103
+ %q_range_32 = arith.addi %q_range_30, %q_range_31 : tensor<4x128x1xi32> loc(#loc233)
104
+ %q_range_33 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<1x128xi32> -> tensor<1x1x128xi32> loc(#loc234)
105
+ %q_range_34 = tt.broadcast %q_range_32 : tensor<4x128x1xi32> -> tensor<4x128x128xi32> loc(#loc235)
106
+ %q_range_35 = tt.broadcast %q_range_33 : tensor<1x1x128xi32> -> tensor<4x128x128xi32> loc(#loc235)
107
+ %q_range_36 = arith.addi %q_range_34, %q_range_35 : tensor<4x128x128xi32> loc(#loc235)
108
+ %q = tt.splat %ks0 : i32 -> tensor<1x128x1xi32> loc(#loc236)
109
+ %q_37 = arith.cmpi slt, %q_range_28, %q : tensor<1x128x1xi32> loc(#loc236)
110
+ %q_38 = tt.addptr %arg_Q, %q_offset_22 : !tt.ptr<bf16>, i32 loc(#loc237)
111
+ %q_39 = tt.splat %q_38 : !tt.ptr<bf16> -> tensor<4x128x128x!tt.ptr<bf16>> loc(#loc238)
112
+ %q_40 = tt.addptr %q_39, %q_range_36 : tensor<4x128x128x!tt.ptr<bf16>>, tensor<4x128x128xi32> loc(#loc238)
113
+ %q_41 = tt.broadcast %q_37 : tensor<1x128x1xi1> -> tensor<4x128x128xi1> loc(#loc239)
114
+ %q_42 = tt.load %q_40, %q_41 : tensor<4x128x128x!tt.ptr<bf16>> loc(#loc239)
115
+ %q_43 = tt.reshape %q_42 : tensor<4x128x128xbf16> -> tensor<512x128xbf16> loc(#loc240)
116
+ %kv_num_blocks = tt.load %arg_KV_NUM_BLKS : !tt.ptr<i32> loc(#loc241)
117
+ %off_n_block_in_sparse = arith.remsi %block_n_start, %c2_i32 : i32 loc(#loc242)
118
+ %off_n = tt.load %arg_KV_IDX : !tt.ptr<i32> loc(#loc243)
119
+ %off_n_44 = arith.muli %off_n, %c128_i32 : i32 loc(#loc244)
120
+ %off_n_45 = arith.muli %off_n_block_in_sparse, %c64_i32 : i32 loc(#loc245)
121
+ %off_n_46 = arith.addi %off_n_44, %off_n_45 : i32 loc(#loc246)
122
+ %block_n_last_valid = arith.muli %kv_num_blocks, %c2_i32 : i32 loc(#loc247)
123
+ %block_n_last_valid_47 = arith.addi %ks1, %c63_i32 : i32 loc(#loc366)
124
+ %block_n_last_valid_48 = arith.divsi %block_n_last_valid_47, %c64_i32 : i32 loc(#loc367)
125
+ %block_n_last_valid_49 = arith.maxsi %block_n_last_valid_48, %c1_i32 : i32 loc(#loc249)
126
+ %block_n_last_valid_50 = arith.minsi %block_n_last_valid, %block_n_last_valid_49 : i32 loc(#loc250)
127
+ %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc251)
128
+ %offs_n_51 = tt.splat %off_n_46 : i32 -> tensor<64xi32> loc(#loc252)
129
+ %offs_n_52 = arith.addi %offs_n, %offs_n_51 : tensor<64xi32> loc(#loc252)
130
+ %4 = tt.expand_dims %offs_m_24 {axis = 1 : i32} : tensor<512xi32> -> tensor<512x1xi32> loc(#loc60)
131
+ %5 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc61)
132
+ %6 = arith.cmpi sle, %block_n_end, %block_n_last_valid_50 : i32 loc(#loc62)
133
+ %7 = arith.select %6, %block_n_end, %block_n_last_valid_50 : i32 loc(#loc63)
134
+ %kv_offset:5 = scf.for %start_n = %block_n_start to %7 step %c1_i32 iter_args(%acc_78 = %acc, %l_i_79 = %cst_14, %m_i_80 = %cst_4, %offs_n_81 = %5, %kv_offset_82 = %c0_i32) -> (tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32) : i32 {
135
+ %kv_base_offset = arith.addi %off_n_46, %kv_offset_82 : i32 loc(#loc369)
136
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc370)
137
+ %offs_n_load_83 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc370)
138
+ %ptr = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc456)
139
+ %ptr_84 = arith.muli %ptr, %cst_0 : tensor<64x1xi32> loc(#loc457)
140
+ %ptr_85 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc458)
141
+ %ptr_86 = tt.addptr %ptr_85, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc458)
142
+ %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc459)
143
+ %ptr_88 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc459)
144
+ %ptr_89 = tt.addptr %ptr_87, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc459)
145
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc460)
146
+ %k_90 = arith.cmpi slt, %ptr, %k : tensor<64x1xi32> loc(#loc460)
147
+ %k_91 = tt.broadcast %k_90 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc461)
148
+ %k_92 = tt.load %ptr_89, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc461)
149
+ %k_93 = tt.trans %k_92 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc372)
150
+ %qk = tt.dot %q_43, %k_93, %cst_13, inputPrecision = tf32 : tensor<512x128xbf16> * tensor<128x64xbf16> -> tensor<512x64xf32> loc(#loc373)
151
+ %qk_94 = arith.mulf %qk, %cst_12 : tensor<512x64xf32> loc(#loc374)
152
+ %m = tt.splat %ks0 : i32 -> tensor<512x1xi32> loc(#loc462)
153
+ %m_95 = arith.remsi %4, %m : tensor<512x1xi32> loc(#loc462)
154
+ %n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc463)
155
+ %n_96 = arith.remsi %offs_n_81, %n : tensor<1x64xi32> loc(#loc463)
156
+ %post_mod_scores = arith.cmpi slt, %offs_n_81, %n : tensor<1x64xi32> loc(#loc377)
157
+ %post_mod_scores_97 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc378)
158
+ %post_mod_scores_98 = arith.select %post_mod_scores_97, %qk_94, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc378)
159
+ %tmp3 = arith.cmpi slt, %m_95, %cst_9 : tensor<512x1xi32> loc(#loc379)
160
+ %tmp5 = tt.broadcast %n_96 : tensor<1x64xi32> -> tensor<512x64xi32> loc(#loc380)
161
+ %tmp5_99 = tt.broadcast %m_95 : tensor<512x1xi32> -> tensor<512x64xi32> loc(#loc380)
162
+ %tmp5_100 = arith.cmpi sle, %tmp5, %tmp5_99 : tensor<512x64xi32> loc(#loc380)
163
+ %tmp6 = tt.broadcast %tmp3 : tensor<512x1xi1> -> tensor<512x64xi1> loc(#loc381)
164
+ %tmp6_101 = arith.andi %tmp6, %tmp5_100 : tensor<512x64xi1> loc(#loc381)
165
+ %tmp7 = arith.cmpi sge, %m_95, %cst_9 : tensor<512x1xi32> loc(#loc382)
166
+ %tmp8 = arith.cmpi slt, %n_96, %cst_10 : tensor<1x64xi32> loc(#loc383)
167
+ %tmp9 = tt.broadcast %tmp7 : tensor<512x1xi1> -> tensor<512x64xi1> loc(#loc384)
168
+ %tmp9_102 = tt.broadcast %tmp8 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc384)
169
+ %tmp9_103 = arith.andi %tmp9, %tmp9_102 : tensor<512x64xi1> loc(#loc384)
170
+ %tmp10 = arith.extui %tmp8 : tensor<1x64xi1> to tensor<1x64xi32> loc(#loc385)
171
+ %tmp10_104 = arith.cmpi eq, %tmp10, %cst_10 : tensor<1x64xi32> loc(#loc385)
172
+ %tmp11 = tt.broadcast %tmp10_104 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc386)
173
+ %tmp11_105 = arith.andi %tmp9, %tmp11 : tensor<512x64xi1> loc(#loc386)
174
+ %tmp14 = arith.remsi %m_95, %cst_3 : tensor<512x1xi32> loc(#loc387)
175
+ %tmp14_106 = arith.cmpi ne, %tmp14, %cst_9 : tensor<512x1xi32> loc(#loc388)
176
+ %tmp14_107 = arith.divsi %m_95, %cst_3 : tensor<512x1xi32> loc(#loc389)
177
+ %tmp14_108 = arith.subi %tmp14_107, %cst_8 : tensor<512x1xi32> loc(#loc390)
178
+ %tmp14_109 = arith.select %tmp14_106, %tmp14_108, %tmp14_107 : tensor<512x1xi1>, tensor<512x1xi32> loc(#loc391)
179
+ %tmp14_110 = arith.select %tmp3, %tmp14_109, %tmp14_107 : tensor<512x1xi1>, tensor<512x1xi32> loc(#loc392)
180
+ %tmp16 = arith.remsi %n_96, %cst_2 : tensor<1x64xi32> loc(#loc393)
181
+ %tmp16_111 = arith.cmpi ne, %tmp16, %cst_10 : tensor<1x64xi32> loc(#loc394)
182
+ %tmp16_112 = arith.divsi %n_96, %cst_2 : tensor<1x64xi32> loc(#loc395)
183
+ %tmp16_113 = arith.subi %tmp16_112, %cst_7 : tensor<1x64xi32> loc(#loc396)
184
+ %tmp16_114 = arith.select %tmp16_111, %tmp16_113, %tmp16_112 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc397)
185
+ %tmp16_115 = arith.select %tmp8, %tmp16_114, %tmp16_112 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc398)
186
+ %tmp17 = tt.broadcast %tmp14_110 : tensor<512x1xi32> -> tensor<512x64xi32> loc(#loc399)
187
+ %tmp17_116 = tt.broadcast %tmp16_115 : tensor<1x64xi32> -> tensor<512x64xi32> loc(#loc399)
188
+ %tmp17_117 = arith.cmpi eq, %tmp17, %tmp17_116 : tensor<512x64xi32> loc(#loc399)
189
+ %tmp18 = arith.andi %tmp11_105, %tmp17_117 : tensor<512x64xi1> loc(#loc400)
190
+ %tmp19 = arith.ori %tmp9_103, %tmp18 : tensor<512x64xi1> loc(#loc401)
191
+ %tmp20 = arith.ori %tmp6_101, %tmp19 : tensor<512x64xi1> loc(#loc402)
192
+ %mask_mod_output = arith.select %post_mod_scores_97, %tmp20, %cst_6 : tensor<512x64xi1>, tensor<512x64xi1> loc(#loc403)
193
+ %post_mod_scores_118 = arith.select %mask_mod_output, %post_mod_scores_98, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc404)
194
+ %post_mod_scores_119 = arith.mulf %post_mod_scores_118, %cst_5 : tensor<512x64xf32> loc(#loc405)
195
+ %m_ij = "tt.reduce"(%post_mod_scores_119) <{axis = 1 : i32}> ({
196
+ ^bb0(%m_ij_152: f32 loc(callsite(#loc1 at #loc406)), %m_ij_153: f32 loc(callsite(#loc1 at #loc406))):
197
+ %m_ij_154 = arith.maxnumf %m_ij_152, %m_ij_153 : f32 loc(#loc519)
198
+ tt.reduce.return %m_ij_154 : f32 loc(#loc464)
199
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc464)
200
+ %m_ij_120 = arith.maxnumf %m_i_80, %m_ij : tensor<512xf32> loc(#loc407)
201
+ %masked_out_rows = arith.cmpf oeq, %m_ij_120, %cst_4 : tensor<512xf32> loc(#loc408)
202
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_120 : tensor<512xi1>, tensor<512xf32> loc(#loc409)
203
+ %alpha = arith.subf %m_i_80, %m_ij_masked : tensor<512xf32> loc(#loc410)
204
+ %alpha_121 = math.exp2 %alpha : tensor<512xf32> loc(#loc411)
205
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc412)
206
+ %p_122 = tt.broadcast %p : tensor<512x1xf32> -> tensor<512x64xf32> loc(#loc413)
207
+ %p_123 = arith.subf %post_mod_scores_119, %p_122 : tensor<512x64xf32> loc(#loc413)
208
+ %p_124 = math.exp2 %p_123 : tensor<512x64xf32> loc(#loc414)
209
+ %l_i_125 = arith.mulf %l_i_79, %alpha_121 : tensor<512xf32> loc(#loc415)
210
+ %l_i_126 = "tt.reduce"(%p_124) <{axis = 1 : i32}> ({
211
+ ^bb0(%l_i_152: f32 loc(callsite(#loc1 at #loc416)), %l_i_153: f32 loc(callsite(#loc1 at #loc416))):
212
+ %l_i_154 = arith.addf %l_i_152, %l_i_153 : f32 loc(#loc520)
213
+ tt.reduce.return %l_i_154 : f32 loc(#loc466)
214
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc466)
215
+ %l_i_127 = arith.addf %l_i_125, %l_i_126 : tensor<512xf32> loc(#loc417)
216
+ %acc_128 = tt.expand_dims %alpha_121 {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc418)
217
+ %acc_129 = tt.broadcast %acc_128 : tensor<512x1xf32> -> tensor<512x128xf32> loc(#loc419)
218
+ %acc_130 = arith.mulf %acc_78, %acc_129 : tensor<512x128xf32> loc(#loc419)
219
+ %ptr_131 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc468)
220
+ %ptr_132 = tt.addptr %ptr_131, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc468)
221
+ %ptr_133 = tt.broadcast %ptr_132 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc469)
222
+ %ptr_134 = tt.addptr %ptr_133, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc469)
223
+ %v = tt.load %ptr_134, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc470)
224
+ %acc_135 = arith.truncf %p_124 : tensor<512x64xf32> to tensor<512x64xbf16> loc(#loc421)
225
+ %acc_136 = tt.dot %acc_135, %v, %acc_130, inputPrecision = tf32 : tensor<512x64xbf16> * tensor<64x128xbf16> -> tensor<512x128xf32> loc(#loc422)
226
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc471)
227
+ %cur_block = tt.addptr %arg_KV_IDX, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc472)
228
+ %cur_block_137 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc473)
229
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc474)
230
+ %next_block_138 = arith.cmpi slt, %next_block, %kv_num_blocks : i32 loc(#loc475)
231
+ %next_block_139 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc476)
232
+ %next_block_140 = tt.load %next_block_139, %next_block_138 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc477)
233
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc478)
234
+ %needs_jump_141 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc479)
235
+ %needs_jump_142 = arith.cmpi eq, %needs_jump_141, %c0_i32 : i32 loc(#loc480)
236
+ %jump_to_block = arith.subi %next_block_140, %cur_block_137 : i32 loc(#loc481)
237
+ %jump_to_block_143 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc482)
238
+ %jump_to_block_144 = arith.subi %jump_to_block_143, %c64_i32 : i32 loc(#loc483)
239
+ %offset = arith.extui %needs_jump_142 : i1 to i32 loc(#loc484)
240
+ %offset_145 = arith.muli %jump_to_block_144, %offset : i32 loc(#loc484)
241
+ %offset_146 = arith.subi %c1_i32, %offset : i32 loc(#loc485)
242
+ %offset_147 = arith.muli %offset_146, %c64_i32 : i32 loc(#loc486)
243
+ %offset_148 = arith.addi %offset_145, %offset_147 : i32 loc(#loc487)
244
+ %offs_n_149 = tt.splat %offset_148 : i32 -> tensor<1x64xi32> loc(#loc424)
245
+ %offs_n_150 = arith.addi %offs_n_81, %offs_n_149 : tensor<1x64xi32> loc(#loc424)
246
+ %kv_offset_151 = arith.addi %kv_offset_82, %offset_148 : i32 loc(#loc425)
247
+ scf.yield %acc_136, %l_i_127, %m_ij_120, %offs_n_150, %kv_offset_151 : tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32 loc(#loc333)
248
+ } loc(#loc524)
249
+ %kv_num_blocks_53 = tt.load %arg_FULL_KV_NUM_BLKS : !tt.ptr<i32> loc(#loc334)
250
+ %block_n_start_54 = arith.subi %c31_i32, %off_t : i32 loc(#loc335)
251
+ %block_n_start_55 = arith.muli %block_n_start_54, %3 : i32 loc(#loc336)
252
+ %block_n_end_56 = arith.addi %block_n_start_55, %3 : i32 loc(#loc337)
253
+ %off_n_block_in_sparse_57 = arith.remsi %block_n_start_55, %c2_i32 : i32 loc(#loc338)
254
+ %off_n_58 = tt.load %arg_FULL_KV_IDX : !tt.ptr<i32> loc(#loc339)
255
+ %off_n_59 = arith.muli %off_n_58, %c128_i32 : i32 loc(#loc340)
256
+ %off_n_60 = arith.muli %off_n_block_in_sparse_57, %c64_i32 : i32 loc(#loc341)
257
+ %off_n_61 = arith.addi %off_n_59, %off_n_60 : i32 loc(#loc342)
258
+ %block_n_last_valid_62 = arith.muli %kv_num_blocks_53, %c2_i32 : i32 loc(#loc343)
259
+ %block_n_last_valid_63 = arith.minsi %block_n_last_valid_62, %block_n_last_valid_49 : i32 loc(#loc344)
260
+ %offs_n_64 = tt.splat %off_n_61 : i32 -> tensor<64xi32> loc(#loc345)
261
+ %offs_n_65 = arith.addi %offs_n, %offs_n_64 : tensor<64xi32> loc(#loc345)
262
+ %8 = tt.expand_dims %offs_n_65 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164)
263
+ %9 = arith.cmpi sle, %block_n_end_56, %block_n_last_valid_63 : i32 loc(#loc165)
264
+ %10 = arith.select %9, %block_n_end_56, %block_n_last_valid_63 : i32 loc(#loc166)
265
+ %kv_offset_66:5 = scf.for %start_n = %block_n_start_55 to %10 step %c1_i32 iter_args(%acc_78 = %kv_offset#0, %l_i_79 = %kv_offset#1, %m_i_80 = %kv_offset#2, %offs_n_81 = %8, %kv_offset_82 = %c0_i32) -> (tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32) : i32 {
266
+ %kv_base_offset = arith.addi %off_n_61, %kv_offset_82 : i32 loc(#loc426)
267
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc427)
268
+ %offs_n_load_83 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc427)
269
+ %ptr = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc488)
270
+ %ptr_84 = arith.muli %ptr, %cst_0 : tensor<64x1xi32> loc(#loc489)
271
+ %ptr_85 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc490)
272
+ %ptr_86 = tt.addptr %ptr_85, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc490)
273
+ %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc491)
274
+ %ptr_88 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc491)
275
+ %ptr_89 = tt.addptr %ptr_87, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc491)
276
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc492)
277
+ %k_90 = arith.cmpi slt, %ptr, %k : tensor<64x1xi32> loc(#loc492)
278
+ %k_91 = tt.broadcast %k_90 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc493)
279
+ %k_92 = tt.load %ptr_89, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc493)
280
+ %k_93 = tt.trans %k_92 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc429)
281
+ %qk = tt.dot %q_43, %k_93, %cst_13, inputPrecision = tf32 : tensor<512x128xbf16> * tensor<128x64xbf16> -> tensor<512x64xf32> loc(#loc430)
282
+ %qk_94 = arith.mulf %qk, %cst_12 : tensor<512x64xf32> loc(#loc431)
283
+ %post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc432)
284
+ %post_mod_scores_95 = arith.cmpi slt, %offs_n_81, %post_mod_scores : tensor<1x64xi32> loc(#loc432)
285
+ %post_mod_scores_96 = tt.broadcast %post_mod_scores_95 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc433)
286
+ %post_mod_scores_97 = arith.select %post_mod_scores_96, %qk_94, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc433)
287
+ %post_mod_scores_98 = arith.mulf %post_mod_scores_97, %cst_5 : tensor<512x64xf32> loc(#loc434)
288
+ %m_ij = "tt.reduce"(%post_mod_scores_98) <{axis = 1 : i32}> ({
289
+ ^bb0(%m_ij_131: f32 loc(callsite(#loc1 at #loc435)), %m_ij_132: f32 loc(callsite(#loc1 at #loc435))):
290
+ %m_ij_133 = arith.maxnumf %m_ij_131, %m_ij_132 : f32 loc(#loc521)
291
+ tt.reduce.return %m_ij_133 : f32 loc(#loc494)
292
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc494)
293
+ %m_ij_99 = arith.maxnumf %m_i_80, %m_ij : tensor<512xf32> loc(#loc436)
294
+ %masked_out_rows = arith.cmpf oeq, %m_ij_99, %cst_4 : tensor<512xf32> loc(#loc437)
295
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_99 : tensor<512xi1>, tensor<512xf32> loc(#loc438)
296
+ %alpha = arith.subf %m_i_80, %m_ij_masked : tensor<512xf32> loc(#loc439)
297
+ %alpha_100 = math.exp2 %alpha : tensor<512xf32> loc(#loc440)
298
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc441)
299
+ %p_101 = tt.broadcast %p : tensor<512x1xf32> -> tensor<512x64xf32> loc(#loc442)
300
+ %p_102 = arith.subf %post_mod_scores_98, %p_101 : tensor<512x64xf32> loc(#loc442)
301
+ %p_103 = math.exp2 %p_102 : tensor<512x64xf32> loc(#loc443)
302
+ %l_i_104 = arith.mulf %l_i_79, %alpha_100 : tensor<512xf32> loc(#loc444)
303
+ %l_i_105 = "tt.reduce"(%p_103) <{axis = 1 : i32}> ({
304
+ ^bb0(%l_i_131: f32 loc(callsite(#loc1 at #loc445)), %l_i_132: f32 loc(callsite(#loc1 at #loc445))):
305
+ %l_i_133 = arith.addf %l_i_131, %l_i_132 : f32 loc(#loc522)
306
+ tt.reduce.return %l_i_133 : f32 loc(#loc496)
307
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc496)
308
+ %l_i_106 = arith.addf %l_i_104, %l_i_105 : tensor<512xf32> loc(#loc446)
309
+ %acc_107 = tt.expand_dims %alpha_100 {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc447)
310
+ %acc_108 = tt.broadcast %acc_107 : tensor<512x1xf32> -> tensor<512x128xf32> loc(#loc448)
311
+ %acc_109 = arith.mulf %acc_78, %acc_108 : tensor<512x128xf32> loc(#loc448)
312
+ %ptr_110 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc498)
313
+ %ptr_111 = tt.addptr %ptr_110, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc498)
314
+ %ptr_112 = tt.broadcast %ptr_111 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc499)
315
+ %ptr_113 = tt.addptr %ptr_112, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc499)
316
+ %v = tt.load %ptr_113, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc500)
317
+ %acc_114 = arith.truncf %p_103 : tensor<512x64xf32> to tensor<512x64xbf16> loc(#loc450)
318
+ %acc_115 = tt.dot %acc_114, %v, %acc_109, inputPrecision = tf32 : tensor<512x64xbf16> * tensor<64x128xbf16> -> tensor<512x128xf32> loc(#loc451)
319
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc501)
320
+ %cur_block = tt.addptr %arg_FULL_KV_IDX, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc502)
321
+ %cur_block_116 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc503)
322
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc504)
323
+ %next_block_117 = arith.cmpi slt, %next_block, %kv_num_blocks_53 : i32 loc(#loc505)
324
+ %next_block_118 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc506)
325
+ %next_block_119 = tt.load %next_block_118, %next_block_117 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc507)
326
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc508)
327
+ %needs_jump_120 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc509)
328
+ %needs_jump_121 = arith.cmpi eq, %needs_jump_120, %c0_i32 : i32 loc(#loc510)
329
+ %jump_to_block = arith.subi %next_block_119, %cur_block_116 : i32 loc(#loc511)
330
+ %jump_to_block_122 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc512)
331
+ %jump_to_block_123 = arith.subi %jump_to_block_122, %c64_i32 : i32 loc(#loc513)
332
+ %offset = arith.extui %needs_jump_121 : i1 to i32 loc(#loc514)
333
+ %offset_124 = arith.muli %jump_to_block_123, %offset : i32 loc(#loc514)
334
+ %offset_125 = arith.subi %c1_i32, %offset : i32 loc(#loc515)
335
+ %offset_126 = arith.muli %offset_125, %c64_i32 : i32 loc(#loc516)
336
+ %offset_127 = arith.addi %offset_124, %offset_126 : i32 loc(#loc517)
337
+ %offs_n_128 = tt.splat %offset_127 : i32 -> tensor<1x64xi32> loc(#loc453)
338
+ %offs_n_129 = arith.addi %offs_n_81, %offs_n_128 : tensor<1x64xi32> loc(#loc453)
339
+ %kv_offset_130 = arith.addi %kv_offset_82, %offset_127 : i32 loc(#loc454)
340
+ scf.yield %acc_115, %l_i_106, %m_ij_99, %offs_n_129, %kv_offset_130 : tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32 loc(#loc347)
341
+ } loc(#loc525)
342
+ %m_offset = arith.muli %off_t, %2 : i32 loc(#loc348)
343
+ %m_offset_67 = arith.muli %off_z_20, %1 : i32 loc(#loc349)
344
+ %m_offset_68 = arith.addi %m_offset, %m_offset_67 : i32 loc(#loc350)
345
+ %M_block_ptr = tt.addptr %arg_M, %m_offset_68 : !tt.ptr<f32>, i32 loc(#loc351)
346
+ %M_block_ptr_69 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc352)
347
+ %M_block_ptr_70 = arith.extsi %ks0 : i32 to i64 loc(#loc353)
348
+ %M_block_ptr_71 = arith.extsi %M_block_ptr_69 : i32 to i64 loc(#loc353)
349
+ %L_block_ptr = tt.addptr %arg_L, %m_offset_68 : !tt.ptr<f32>, i32 loc(#loc354)
350
+ %m_i = tt.reshape %kv_offset_66#2 : tensor<512xf32> -> tensor<4x128xf32> loc(#loc355)
351
+ %l_i = tt.reshape %kv_offset_66#1 : tensor<512xf32> -> tensor<4x128xf32> loc(#loc356)
352
+ %11 = tt.splat %M_block_ptr : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc177)
353
+ %12 = tt.splat %M_block_ptr_71 : i64 -> tensor<4xi64> loc(#loc177)
354
+ %13 = arith.extsi %off_g : tensor<4xi32> to tensor<4xi64> loc(#loc177)
355
+ %14 = arith.addi %12, %13 : tensor<4xi64> loc(#loc177)
356
+ %15 = tt.expand_dims %14 {axis = 1 : i32} : tensor<4xi64> -> tensor<4x1xi64> loc(#loc177)
357
+ %16 = tt.splat %M_block_ptr_70 : i64 -> tensor<4x1xi64> loc(#loc177)
358
+ %17 = arith.muli %15, %16 : tensor<4x1xi64> loc(#loc177)
359
+ %18 = tt.broadcast %17 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc177)
360
+ %19 = arith.extsi %off_m : tensor<128xi32> to tensor<128xi64> loc(#loc177)
361
+ %20 = tt.expand_dims %19 {axis = 0 : i32} : tensor<128xi64> -> tensor<1x128xi64> loc(#loc177)
362
+ %21 = tt.broadcast %20 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc177)
363
+ %22 = arith.addi %18, %21 : tensor<4x128xi64> loc(#loc177)
364
+ %23 = tt.addptr %11, %22 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi64> loc(#loc177)
365
+ %24 = arith.cmpi sge, %20, %cst : tensor<1x128xi64> loc(#loc177)
366
+ %25 = tt.splat %M_block_ptr_70 : i64 -> tensor<1x128xi64> loc(#loc177)
367
+ %26 = arith.cmpi slt, %20, %25 : tensor<1x128xi64> loc(#loc177)
368
+ %27 = arith.andi %24, %26 : tensor<1x128xi1> loc(#loc177)
369
+ %28 = tt.broadcast %27 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc177)
370
+ tt.store %23, %m_i, %28 : tensor<4x128x!tt.ptr<f32>> loc(#loc177)
371
+ %29 = tt.splat %L_block_ptr : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc178)
372
+ %30 = tt.addptr %29, %22 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi64> loc(#loc178)
373
+ tt.store %30, %l_i, %28 : tensor<4x128x!tt.ptr<f32>> loc(#loc178)
374
+ %idx_hq = tt.splat %M_block_ptr_69 : i32 -> tensor<4x1x1xi32> loc(#loc357)
375
+ %idx_hq_72 = arith.addi %idx_hq, %q_range_26 : tensor<4x1x1xi32> loc(#loc357)
376
+ %mask_73 = arith.cmpi slt, %q_range_33, %mask : tensor<1x1x128xi32> loc(#loc207)
377
+ %mask_74 = tt.broadcast %q_37 : tensor<1x128x1xi1> -> tensor<1x128x128xi1> loc(#loc358)
378
+ %mask_75 = tt.broadcast %mask_73 : tensor<1x1x128xi1> -> tensor<1x128x128xi1> loc(#loc358)
379
+ %mask_76 = arith.andi %mask_74, %mask_75 : tensor<1x128x128xi1> loc(#loc358)
380
+ %acc_77 = tt.reshape %kv_offset_66#0 : tensor<512x128xf32> -> tensor<4x128x128xf32> loc(#loc359)
381
+ %31 = arith.muli %q_range_28, %cst_15 : tensor<1x128x1xi32> loc(#loc182)
382
+ %32 = tt.broadcast %q_range_33 : tensor<1x1x128xi32> -> tensor<1x128x128xi32> loc(#loc183)
383
+ %33 = tt.broadcast %31 : tensor<1x128x1xi32> -> tensor<1x128x128xi32> loc(#loc183)
384
+ %34 = arith.addi %32, %33 : tensor<1x128x128xi32> loc(#loc183)
385
+ %35 = arith.muli %idx_hq_72, %cst_16 : tensor<4x1x1xi32> loc(#loc184)
386
+ %36 = tt.splat %ks0 : i32 -> tensor<4x1x1xi32> loc(#loc185)
387
+ %37 = arith.muli %35, %36 : tensor<4x1x1xi32> loc(#loc185)
388
+ %38 = tt.broadcast %34 : tensor<1x128x128xi32> -> tensor<4x128x128xi32> loc(#loc186)
389
+ %39 = tt.broadcast %37 : tensor<4x1x1xi32> -> tensor<4x128x128xi32> loc(#loc186)
390
+ %40 = arith.addi %38, %39 : tensor<4x128x128xi32> loc(#loc186)
391
+ %41 = arith.muli %off_t, %c4096_i32 : i32 loc(#loc187)
392
+ %42 = arith.muli %41, %ks0 : i32 loc(#loc188)
393
+ %43 = tt.splat %42 : i32 -> tensor<4x128x128xi32> loc(#loc189)
394
+ %44 = arith.addi %40, %43 : tensor<4x128x128xi32> loc(#loc189)
395
+ %45 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x128x128x!tt.ptr<f32>> loc(#loc190)
396
+ %46 = tt.addptr %45, %44 : tensor<4x128x128x!tt.ptr<f32>>, tensor<4x128x128xi32> loc(#loc190)
397
+ %47 = tt.broadcast %mask_76 : tensor<1x128x128xi1> -> tensor<4x128x128xi1> loc(#loc191)
398
+ tt.store %46, %acc_77, %47 : tensor<4x128x128x!tt.ptr<f32>> loc(#loc191)
399
+ tt.return loc(#loc192)
400
+ } loc(#loc)
401
+ } loc(#loc)
402
+ #loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
403
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":132:19)
404
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":276:38)
405
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:61)
406
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":135:21)
407
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":95:10)
408
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":86:60)
409
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":86:65)
410
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":89:54)
411
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":89:62)
412
+ #loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
413
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":104:33)
414
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
415
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":105:34)
416
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":105:45)
417
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":106:49)
418
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":108:26)
419
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":108:48)
420
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":110:49)
421
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":111:26)
422
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":113:23)
423
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":113:45)
424
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":113:35)
425
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":114:47)
426
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":117:12)
427
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":118:12)
428
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":137:25)
429
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":140:25)
430
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":141:44)
431
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":141:54)
432
+ #loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":74:27)
433
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":141:22)
434
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":152:28)
435
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":153:34)
436
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:32)
437
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:26)
438
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:67)
439
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:49)
440
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:103)
441
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:84)
442
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:72)
443
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:24)
444
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:35)
445
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:20)
446
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":166:22)
447
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":173:28)
448
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":176:44)
449
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:20)
450
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:48)
451
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:95)
452
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:71)
453
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:52)
454
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:99)
455
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:109)
456
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:72)
457
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":183:26)
458
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":183:37)
459
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":194:40)
460
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":194:57)
461
+ #loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":198:53)
462
+ #loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":198:38)
463
+ #loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":532:40)
464
+ #loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":388:32)
465
+ #loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":392:35)
466
+ #loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:27)
467
+ #loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":393:107)
468
+ #loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:38)
469
+ #loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:20)
470
+ #loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:49)
471
+ #loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":336:52)
472
+ #loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":336:23)
473
+ #loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":395:17)
474
+ #loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":397:19)
475
+ #loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":399:14)
476
+ #loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":301:21)
477
+ #loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":404:36)
478
+ #loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":405:36)
479
+ #loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":413:44)
480
+ #loc82 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":413:69)
481
+ #loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":418:22)
482
+ #loc84 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":420:23)
483
+ #loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":421:22)
484
+ #loc86 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":422:23)
485
+ #loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":423:22)
486
+ #loc88 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":424:22)
487
+ #loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":425:24)
488
+ #loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":426:23)
489
+ #loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:70)
490
+ #loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:79)
491
+ #loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:91)
492
+ #loc94 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:99)
493
+ #loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:102)
494
+ #loc96 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:119)
495
+ #loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:70)
496
+ #loc98 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:79)
497
+ #loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:91)
498
+ #loc100 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:99)
499
+ #loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:102)
500
+ #loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:119)
501
+ #loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":432:25)
502
+ #loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":433:24)
503
+ #loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":434:23)
504
+ #loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":435:23)
505
+ #loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":440:73)
506
+ #loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":442:69)
507
+ #loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":445:27)
508
+ #loc110 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
509
+ #loc112 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
510
+ #loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":449:27)
511
+ #loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":451:35)
512
+ #loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":452:51)
513
+ #loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":456:31)
514
+ #loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":456:25)
515
+ #loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":457:51)
516
+ #loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":457:39)
517
+ #loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":457:21)
518
+ #loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":462:16)
519
+ #loc122 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
520
+ #loc124 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
521
+ #loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":462:24)
522
+ #loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":464:22)
523
+ #loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":464:16)
524
+ #loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":467:107)
525
+ #loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":468:22)
526
+ #loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":468:44)
527
+ #loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":291:33)
528
+ #loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":575:63)
529
+ #loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":292:38)
530
+ #loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":292:24)
531
+ #loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:109)
532
+ #loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:113)
533
+ #loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:55)
534
+ #loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:25)
535
+ #loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":294:30)
536
+ #loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":294:35)
537
+ #loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":294:60)
538
+ #loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":295:34)
539
+ #loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":295:48)
540
+ #loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":295:63)
541
+ #loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:29)
542
+ #loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:47)
543
+ #loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:61)
544
+ #loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:42)
545
+ #loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":578:26)
546
+ #loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":579:21)
547
+ #loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":579:8)
548
+ #loc152 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":210:32)
549
+ #loc153 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":212:44)
550
+ #loc154 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":212:49)
551
+ #loc155 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":213:38)
552
+ #loc156 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":215:48)
553
+ #loc157 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:24)
554
+ #loc158 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:52)
555
+ #loc159 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:99)
556
+ #loc160 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:75)
557
+ #loc161 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":219:56)
558
+ #loc162 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":219:76)
559
+ #loc163 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":221:41)
560
+ #loc164 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":229:61)
561
+ #loc165 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":233:57)
562
+ #loc166 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":233:42)
563
+ #loc168 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":239:23)
564
+ #loc169 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":239:43)
565
+ #loc170 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":239:35)
566
+ #loc171 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":243:17)
567
+ #loc172 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":246:25)
568
+ #loc173 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":248:8)
569
+ #loc174 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":251:17)
570
+ #loc175 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":260:25)
571
+ #loc176 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":261:25)
572
+ #loc177 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":266:30)
573
+ #loc178 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":267:30)
574
+ #loc179 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":272:25)
575
+ #loc180 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":276:30)
576
+ #loc181 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":277:41)
577
+ #loc182 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:53)
578
+ #loc183 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:49)
579
+ #loc184 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:65)
580
+ #loc185 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:72)
581
+ #loc186 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:61)
582
+ #loc187 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:83)
583
+ #loc188 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:89)
584
+ #loc189 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:78)
585
+ #loc190 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:25)
586
+ #loc191 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:112)
587
+ #loc192 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:4)
588
+ #loc205 = loc(callsite(#loc1 at #loc2))
589
+ #loc206 = loc("acc"(#loc4))
590
+ #loc207 = loc("mask"(#loc5))
591
+ #loc208 = loc("q_range"(#loc6))
592
+ #loc209 = loc("HKV"(#loc8))
593
+ #loc210 = loc("TILE_KV_OG"(#loc14))
594
+ #loc211 = loc("TILE_KV"(#loc16))
595
+ #loc212 = loc("TILE_KV"(#loc17))
596
+ #loc213 = loc("off_z"(#loc19))
597
+ #loc214 = loc("off_z"(#loc20))
598
+ #loc215 = loc("off_hkv"(#loc21))
599
+ #loc216 = loc("off_t"(#loc22))
600
+ #loc217 = loc("q_offset"(#loc23))
601
+ #loc218 = loc("q_offset"(#loc24))
602
+ #loc219 = loc("q_offset"(#loc25))
603
+ #loc220 = loc("k_offset"(#loc26))
604
+ #loc221 = loc("K"(#loc27))
605
+ #loc222 = loc("V"(#loc28))
606
+ #loc223 = loc("off_g"(#loc29))
607
+ #loc224 = loc("off_m"(#loc30))
608
+ #loc225 = loc("offs_m"(#loc31))
609
+ #loc226 = loc("offs_m"(#loc32))
610
+ #loc227 = loc("offs_m"(#loc34))
611
+ #loc228 = loc("block_n_start"(#loc35))
612
+ #loc229 = loc("block_n_end"(#loc36))
613
+ #loc230 = loc("q_range"(#loc37))
614
+ #loc231 = loc("q_range"(#loc38))
615
+ #loc232 = loc("q_range"(#loc39))
616
+ #loc233 = loc("q_range"(#loc40))
617
+ #loc234 = loc("q_range"(#loc41))
618
+ #loc235 = loc("q_range"(#loc42))
619
+ #loc236 = loc("q"(#loc43))
620
+ #loc237 = loc("q"(#loc44))
621
+ #loc238 = loc("q"(#loc45))
622
+ #loc239 = loc("q"(#loc46))
623
+ #loc240 = loc("q"(#loc47))
624
+ #loc241 = loc("kv_num_blocks"(#loc48))
625
+ #loc242 = loc("off_n_block_in_sparse"(#loc49))
626
+ #loc243 = loc("off_n"(#loc50))
627
+ #loc244 = loc("off_n"(#loc51))
628
+ #loc245 = loc("off_n"(#loc52))
629
+ #loc246 = loc("off_n"(#loc53))
630
+ #loc247 = loc("block_n_last_valid"(#loc54))
631
+ #loc248 = loc("block_n_last_valid"(#loc55))
632
+ #loc249 = loc("block_n_last_valid"(#loc56))
633
+ #loc250 = loc("block_n_last_valid"(#loc57))
634
+ #loc251 = loc("offs_n"(#loc58))
635
+ #loc252 = loc("offs_n"(#loc59))
636
+ #loc253 = loc("acc"(#loc64))
637
+ #loc254 = loc("kv_base_offset"(#loc65))
638
+ #loc256 = loc("offs_n_load"(#loc67))
639
+ #loc257 = loc("ptr"(#loc68))
640
+ #loc258 = loc("k"(#loc69))
641
+ #loc259 = loc("ptr"(#loc70))
642
+ #loc260 = loc("ptr"(#loc71))
643
+ #loc261 = loc("ptr"(#loc72))
644
+ #loc262 = loc("k"(#loc75))
645
+ #loc263 = loc("qk"(#loc76))
646
+ #loc264 = loc("qk"(#loc77))
647
+ #loc265 = loc("m"(#loc79))
648
+ #loc266 = loc("n"(#loc80))
649
+ #loc267 = loc("post_mod_scores"(#loc81))
650
+ #loc268 = loc("post_mod_scores"(#loc82))
651
+ #loc269 = loc("tmp3"(#loc83))
652
+ #loc270 = loc("tmp5"(#loc84))
653
+ #loc271 = loc("tmp6"(#loc85))
654
+ #loc272 = loc("tmp7"(#loc86))
655
+ #loc273 = loc("tmp8"(#loc87))
656
+ #loc274 = loc("tmp9"(#loc88))
657
+ #loc275 = loc("tmp10"(#loc89))
658
+ #loc276 = loc("tmp11"(#loc90))
659
+ #loc277 = loc("tmp14"(#loc91))
660
+ #loc278 = loc("tmp14"(#loc92))
661
+ #loc279 = loc("tmp14"(#loc93))
662
+ #loc280 = loc("tmp14"(#loc94))
663
+ #loc281 = loc("tmp14"(#loc95))
664
+ #loc282 = loc("tmp14"(#loc96))
665
+ #loc283 = loc("tmp16"(#loc97))
666
+ #loc284 = loc("tmp16"(#loc98))
667
+ #loc285 = loc("tmp16"(#loc99))
668
+ #loc286 = loc("tmp16"(#loc100))
669
+ #loc287 = loc("tmp16"(#loc101))
670
+ #loc288 = loc("tmp16"(#loc102))
671
+ #loc289 = loc("tmp17"(#loc103))
672
+ #loc290 = loc("tmp18"(#loc104))
673
+ #loc291 = loc("tmp19"(#loc105))
674
+ #loc292 = loc("tmp20"(#loc106))
675
+ #loc293 = loc("mask_mod_output"(#loc107))
676
+ #loc294 = loc("post_mod_scores"(#loc108))
677
+ #loc295 = loc("post_mod_scores"(#loc109))
678
+ #loc297 = loc("m_ij"(#loc113))
679
+ #loc298 = loc("masked_out_rows"(#loc114))
680
+ #loc299 = loc("m_ij_masked"(#loc115))
681
+ #loc300 = loc("alpha"(#loc116))
682
+ #loc301 = loc("alpha"(#loc117))
683
+ #loc302 = loc("p"(#loc118))
684
+ #loc303 = loc("p"(#loc119))
685
+ #loc304 = loc("p"(#loc120))
686
+ #loc305 = loc("l_i"(#loc121))
687
+ #loc307 = loc("l_i"(#loc125))
688
+ #loc308 = loc("acc"(#loc126))
689
+ #loc309 = loc("acc"(#loc127))
690
+ #loc310 = loc("v"(#loc128))
691
+ #loc311 = loc("acc"(#loc129))
692
+ #loc312 = loc("acc"(#loc130))
693
+ #loc313 = loc("cur_block_idx"(#loc131))
694
+ #loc314 = loc("offset"(#loc132))
695
+ #loc315 = loc("cur_block"(#loc133))
696
+ #loc316 = loc("cur_block"(#loc134))
697
+ #loc317 = loc("next_block"(#loc135))
698
+ #loc318 = loc("next_block"(#loc136))
699
+ #loc319 = loc("next_block"(#loc137))
700
+ #loc320 = loc("next_block"(#loc138))
701
+ #loc321 = loc("needs_jump"(#loc139))
702
+ #loc322 = loc("needs_jump"(#loc140))
703
+ #loc323 = loc("needs_jump"(#loc141))
704
+ #loc324 = loc("jump_to_block"(#loc142))
705
+ #loc325 = loc("jump_to_block"(#loc143))
706
+ #loc326 = loc("jump_to_block"(#loc144))
707
+ #loc327 = loc("offset"(#loc145))
708
+ #loc328 = loc("offset"(#loc146))
709
+ #loc329 = loc("offset"(#loc147))
710
+ #loc330 = loc("offset"(#loc148))
711
+ #loc331 = loc("offs_n"(#loc149))
712
+ #loc332 = loc("kv_offset"(#loc150))
713
+ #loc333 = loc(callsite(#loc151 at #loc2))
714
+ #loc334 = loc("kv_num_blocks"(#loc152))
715
+ #loc335 = loc("block_n_start"(#loc153))
716
+ #loc336 = loc("block_n_start"(#loc154))
717
+ #loc337 = loc("block_n_end"(#loc155))
718
+ #loc338 = loc("off_n_block_in_sparse"(#loc156))
719
+ #loc339 = loc("off_n"(#loc157))
720
+ #loc340 = loc("off_n"(#loc158))
721
+ #loc341 = loc("off_n"(#loc159))
722
+ #loc342 = loc("off_n"(#loc160))
723
+ #loc343 = loc("block_n_last_valid"(#loc161))
724
+ #loc344 = loc("block_n_last_valid"(#loc162))
725
+ #loc345 = loc("offs_n"(#loc163))
726
+ #loc347 = loc(callsite(#loc151 at #loc167))
727
+ #loc348 = loc("m_offset"(#loc168))
728
+ #loc349 = loc("m_offset"(#loc169))
729
+ #loc350 = loc("m_offset"(#loc170))
730
+ #loc351 = loc("M_block_ptr"(#loc171))
731
+ #loc352 = loc("M_block_ptr"(#loc172))
732
+ #loc353 = loc("M_block_ptr"(#loc173))
733
+ #loc354 = loc("L_block_ptr"(#loc174))
734
+ #loc355 = loc("m_i"(#loc175))
735
+ #loc356 = loc("l_i"(#loc176))
736
+ #loc357 = loc("idx_hq"(#loc179))
737
+ #loc358 = loc("mask"(#loc180))
738
+ #loc359 = loc("acc"(#loc181))
739
+ #loc360 = loc(callsite(#loc3 at #loc206))
740
+ #loc361 = loc(callsite(#loc13 at #loc210))
741
+ #loc362 = loc(callsite(#loc15 at #loc210))
742
+ #loc363 = loc(callsite(#loc13 at #loc211))
743
+ #loc364 = loc(callsite(#loc15 at #loc211))
744
+ #loc365 = loc(callsite(#loc33 at #loc227))
745
+ #loc366 = loc(callsite(#loc13 at #loc248))
746
+ #loc367 = loc(callsite(#loc15 at #loc248))
747
+ #loc368 = loc("l_i"(#loc253))
748
+ #loc369 = loc(callsite(#loc254 at #loc255))
749
+ #loc370 = loc(callsite(#loc256 at #loc255))
750
+ #loc371 = loc(callsite(#loc258 at #loc255))
751
+ #loc372 = loc(callsite(#loc262 at #loc255))
752
+ #loc373 = loc(callsite(#loc263 at #loc255))
753
+ #loc374 = loc(callsite(#loc264 at #loc255))
754
+ #loc375 = loc(callsite(#loc265 at #loc255))
755
+ #loc376 = loc(callsite(#loc266 at #loc255))
756
+ #loc377 = loc(callsite(#loc267 at #loc255))
757
+ #loc378 = loc(callsite(#loc268 at #loc255))
758
+ #loc379 = loc(callsite(#loc269 at #loc255))
759
+ #loc380 = loc(callsite(#loc270 at #loc255))
760
+ #loc381 = loc(callsite(#loc271 at #loc255))
761
+ #loc382 = loc(callsite(#loc272 at #loc255))
762
+ #loc383 = loc(callsite(#loc273 at #loc255))
763
+ #loc384 = loc(callsite(#loc274 at #loc255))
764
+ #loc385 = loc(callsite(#loc275 at #loc255))
765
+ #loc386 = loc(callsite(#loc276 at #loc255))
766
+ #loc387 = loc(callsite(#loc277 at #loc255))
767
+ #loc388 = loc(callsite(#loc278 at #loc255))
768
+ #loc389 = loc(callsite(#loc279 at #loc255))
769
+ #loc390 = loc(callsite(#loc280 at #loc255))
770
+ #loc391 = loc(callsite(#loc281 at #loc255))
771
+ #loc392 = loc(callsite(#loc282 at #loc255))
772
+ #loc393 = loc(callsite(#loc283 at #loc255))
773
+ #loc394 = loc(callsite(#loc284 at #loc255))
774
+ #loc395 = loc(callsite(#loc285 at #loc255))
775
+ #loc396 = loc(callsite(#loc286 at #loc255))
776
+ #loc397 = loc(callsite(#loc287 at #loc255))
777
+ #loc398 = loc(callsite(#loc288 at #loc255))
778
+ #loc399 = loc(callsite(#loc289 at #loc255))
779
+ #loc400 = loc(callsite(#loc290 at #loc255))
780
+ #loc401 = loc(callsite(#loc291 at #loc255))
781
+ #loc402 = loc(callsite(#loc292 at #loc255))
782
+ #loc403 = loc(callsite(#loc293 at #loc255))
783
+ #loc404 = loc(callsite(#loc294 at #loc255))
784
+ #loc405 = loc(callsite(#loc295 at #loc255))
785
+ #loc407 = loc(callsite(#loc297 at #loc255))
786
+ #loc408 = loc(callsite(#loc298 at #loc255))
787
+ #loc409 = loc(callsite(#loc299 at #loc255))
788
+ #loc410 = loc(callsite(#loc300 at #loc255))
789
+ #loc411 = loc(callsite(#loc301 at #loc255))
790
+ #loc412 = loc(callsite(#loc302 at #loc255))
791
+ #loc413 = loc(callsite(#loc303 at #loc255))
792
+ #loc414 = loc(callsite(#loc304 at #loc255))
793
+ #loc415 = loc(callsite(#loc305 at #loc255))
794
+ #loc417 = loc(callsite(#loc307 at #loc255))
795
+ #loc418 = loc(callsite(#loc308 at #loc255))
796
+ #loc419 = loc(callsite(#loc309 at #loc255))
797
+ #loc420 = loc(callsite(#loc310 at #loc255))
798
+ #loc421 = loc(callsite(#loc311 at #loc255))
799
+ #loc422 = loc(callsite(#loc312 at #loc255))
800
+ #loc423 = loc(callsite(#loc314 at #loc2))
801
+ #loc424 = loc(callsite(#loc331 at #loc2))
802
+ #loc425 = loc(callsite(#loc332 at #loc2))
803
+ #loc426 = loc(callsite(#loc254 at #loc346))
804
+ #loc427 = loc(callsite(#loc256 at #loc346))
805
+ #loc428 = loc(callsite(#loc258 at #loc346))
806
+ #loc429 = loc(callsite(#loc262 at #loc346))
807
+ #loc430 = loc(callsite(#loc263 at #loc346))
808
+ #loc431 = loc(callsite(#loc264 at #loc346))
809
+ #loc432 = loc(callsite(#loc267 at #loc346))
810
+ #loc433 = loc(callsite(#loc268 at #loc346))
811
+ #loc434 = loc(callsite(#loc295 at #loc346))
812
+ #loc436 = loc(callsite(#loc297 at #loc346))
813
+ #loc437 = loc(callsite(#loc298 at #loc346))
814
+ #loc438 = loc(callsite(#loc299 at #loc346))
815
+ #loc439 = loc(callsite(#loc300 at #loc346))
816
+ #loc440 = loc(callsite(#loc301 at #loc346))
817
+ #loc441 = loc(callsite(#loc302 at #loc346))
818
+ #loc442 = loc(callsite(#loc303 at #loc346))
819
+ #loc443 = loc(callsite(#loc304 at #loc346))
820
+ #loc444 = loc(callsite(#loc305 at #loc346))
821
+ #loc446 = loc(callsite(#loc307 at #loc346))
822
+ #loc447 = loc(callsite(#loc308 at #loc346))
823
+ #loc448 = loc(callsite(#loc309 at #loc346))
824
+ #loc449 = loc(callsite(#loc310 at #loc346))
825
+ #loc450 = loc(callsite(#loc311 at #loc346))
826
+ #loc451 = loc(callsite(#loc312 at #loc346))
827
+ #loc452 = loc(callsite(#loc314 at #loc167))
828
+ #loc453 = loc(callsite(#loc331 at #loc167))
829
+ #loc454 = loc(callsite(#loc332 at #loc167))
830
+ #loc455 = loc("m_i"(#loc368))
831
+ #loc456 = loc(callsite(#loc257 at #loc371))
832
+ #loc457 = loc(callsite(#loc259 at #loc371))
833
+ #loc458 = loc(callsite(#loc260 at #loc371))
834
+ #loc459 = loc(callsite(#loc261 at #loc371))
835
+ #loc460 = loc(callsite(#loc73 at #loc371))
836
+ #loc461 = loc(callsite(#loc74 at #loc371))
837
+ #loc462 = loc(callsite(#loc78 at #loc375))
838
+ #loc463 = loc(callsite(#loc78 at #loc376))
839
+ #loc464 = loc(callsite(#loc110 at #loc406))
840
+ #loc466 = loc(callsite(#loc122 at #loc416))
841
+ #loc468 = loc(callsite(#loc260 at #loc420))
842
+ #loc469 = loc(callsite(#loc261 at #loc420))
843
+ #loc470 = loc(callsite(#loc74 at #loc420))
844
+ #loc471 = loc(callsite(#loc313 at #loc423))
845
+ #loc472 = loc(callsite(#loc315 at #loc423))
846
+ #loc473 = loc(callsite(#loc316 at #loc423))
847
+ #loc474 = loc(callsite(#loc317 at #loc423))
848
+ #loc475 = loc(callsite(#loc318 at #loc423))
849
+ #loc476 = loc(callsite(#loc319 at #loc423))
850
+ #loc477 = loc(callsite(#loc320 at #loc423))
851
+ #loc478 = loc(callsite(#loc321 at #loc423))
852
+ #loc479 = loc(callsite(#loc322 at #loc423))
853
+ #loc480 = loc(callsite(#loc323 at #loc423))
854
+ #loc481 = loc(callsite(#loc324 at #loc423))
855
+ #loc482 = loc(callsite(#loc325 at #loc423))
856
+ #loc483 = loc(callsite(#loc326 at #loc423))
857
+ #loc484 = loc(callsite(#loc327 at #loc423))
858
+ #loc485 = loc(callsite(#loc328 at #loc423))
859
+ #loc486 = loc(callsite(#loc329 at #loc423))
860
+ #loc487 = loc(callsite(#loc330 at #loc423))
861
+ #loc488 = loc(callsite(#loc257 at #loc428))
862
+ #loc489 = loc(callsite(#loc259 at #loc428))
863
+ #loc490 = loc(callsite(#loc260 at #loc428))
864
+ #loc491 = loc(callsite(#loc261 at #loc428))
865
+ #loc492 = loc(callsite(#loc73 at #loc428))
866
+ #loc493 = loc(callsite(#loc74 at #loc428))
867
+ #loc494 = loc(callsite(#loc110 at #loc435))
868
+ #loc496 = loc(callsite(#loc122 at #loc445))
869
+ #loc498 = loc(callsite(#loc260 at #loc449))
870
+ #loc499 = loc(callsite(#loc261 at #loc449))
871
+ #loc500 = loc(callsite(#loc74 at #loc449))
872
+ #loc501 = loc(callsite(#loc313 at #loc452))
873
+ #loc502 = loc(callsite(#loc315 at #loc452))
874
+ #loc503 = loc(callsite(#loc316 at #loc452))
875
+ #loc504 = loc(callsite(#loc317 at #loc452))
876
+ #loc505 = loc(callsite(#loc318 at #loc452))
877
+ #loc506 = loc(callsite(#loc319 at #loc452))
878
+ #loc507 = loc(callsite(#loc320 at #loc452))
879
+ #loc508 = loc(callsite(#loc321 at #loc452))
880
+ #loc509 = loc(callsite(#loc322 at #loc452))
881
+ #loc510 = loc(callsite(#loc323 at #loc452))
882
+ #loc511 = loc(callsite(#loc324 at #loc452))
883
+ #loc512 = loc(callsite(#loc325 at #loc452))
884
+ #loc513 = loc(callsite(#loc326 at #loc452))
885
+ #loc514 = loc(callsite(#loc327 at #loc452))
886
+ #loc515 = loc(callsite(#loc328 at #loc452))
887
+ #loc516 = loc(callsite(#loc329 at #loc452))
888
+ #loc517 = loc(callsite(#loc330 at #loc452))
889
+ #loc518 = loc("offs_n"(#loc455))
890
+ #loc519 = loc(callsite(#loc112 at #loc464))
891
+ #loc520 = loc(callsite(#loc124 at #loc466))
892
+ #loc521 = loc(callsite(#loc112 at #loc494))
893
+ #loc522 = loc(callsite(#loc124 at #loc496))
894
+ #loc523 = loc("kv_offset"(#loc518))
895
+ #loc524 = loc(callsite(#loc523 at #loc2))
896
+ #loc525 = loc(callsite(#loc523 at #loc167))
progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/__grp__triton_tem_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.source", "triton_tem_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttir", "triton_tem_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttgir", "triton_tem_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.llir", "triton_tem_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ptx", "triton_tem_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.cubin", "triton_tem_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.json"}}
progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "e9189965502344ccf889732c148fe3546cbfbba32ff4ff89a591e4bb193cb58d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_mul_1"}
progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.llir ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ptx ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.source ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttgir ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttir ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/__grp__triton_poi_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_poi_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.source", "triton_poi_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ttir", "triton_poi_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ttgir", "triton_poi_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.llir", "triton_poi_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ptx", "triton_poi_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.cubin", "triton_poi_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.json"}}
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.cubin ADDED
Binary file (18.7 kB). View file
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "ed7e788982b21fcc57c223c2b090190cce71d729f3cfd790c35719a5fbfc0487", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_1"}
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.llir ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_poi_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
9
+ %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %8 = shl i32 %7, 9, !dbg !8
11
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %10 = and i32 %9, 127, !dbg !9
13
+ %11 = shl nuw nsw i32 %10, 2, !dbg !9
14
+ %12 = or disjoint i32 %11, %8, !dbg !10
15
+ %13 = icmp slt i32 %12, %3, !dbg !11
16
+ %14 = or disjoint i32 %8, %10, !dbg !10
17
+ %15 = or disjoint i32 %14, 128, !dbg !10
18
+ %16 = or disjoint i32 %14, 256, !dbg !10
19
+ %17 = or disjoint i32 %14, 384, !dbg !10
20
+ %18 = icmp slt i32 %14, %3, !dbg !11
21
+ %19 = icmp slt i32 %15, %3, !dbg !11
22
+ %20 = icmp slt i32 %16, %3, !dbg !11
23
+ %21 = icmp slt i32 %17, %3, !dbg !11
24
+ %22 = insertelement <4 x i32> poison, i32 %17, i64 0, !dbg !12
25
+ %23 = insertelement <4 x i32> %22, i32 %16, i64 1, !dbg !12
26
+ %24 = insertelement <4 x i32> %23, i32 %15, i64 2, !dbg !12
27
+ %25 = insertelement <4 x i32> %24, i32 %14, i64 3, !dbg !12
28
+ %26 = sext <4 x i32> %25 to <4 x i64>, !dbg !12
29
+ %27 = insertelement <4 x i64> poison, i64 %2, i64 0, !dbg !12
30
+ %28 = shufflevector <4 x i64> %27, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !12
31
+ %29 = srem <4 x i64> %26, %28, !dbg !12
32
+ %30 = extractelement <4 x i64> %26, i64 3, !dbg !13
33
+ %31 = sdiv i64 %30, %2, !dbg !13
34
+ %32 = extractelement <4 x i64> %26, i64 2, !dbg !13
35
+ %33 = sdiv i64 %32, %2, !dbg !13
36
+ %34 = extractelement <4 x i64> %26, i64 1, !dbg !13
37
+ %35 = sdiv i64 %34, %2, !dbg !13
38
+ %36 = extractelement <4 x i64> %26, i64 0, !dbg !13
39
+ %37 = sdiv i64 %36, %2, !dbg !13
40
+ %38 = icmp ne <4 x i64> %29, zeroinitializer, !dbg !17
41
+ %39 = icmp slt i32 %8, 0, !dbg !18
42
+ %40 = icmp slt i64 %2, 0, !dbg !19
43
+ %41 = xor i1 %39, %40, !dbg !20
44
+ %42 = extractelement <4 x i1> %38, i64 3, !dbg !21
45
+ %narrow = select i1 %41, i1 %42, i1 false, !dbg !21
46
+ %43 = sext i1 %narrow to i64, !dbg !21
47
+ %44 = add nsw i64 %31, %43, !dbg !21
48
+ %45 = extractelement <4 x i1> %38, i64 2, !dbg !21
49
+ %narrow4 = select i1 %41, i1 %45, i1 false, !dbg !21
50
+ %46 = sext i1 %narrow4 to i64, !dbg !21
51
+ %47 = add nsw i64 %33, %46, !dbg !21
52
+ %48 = extractelement <4 x i1> %38, i64 1, !dbg !21
53
+ %narrow5 = select i1 %41, i1 %48, i1 false, !dbg !21
54
+ %49 = sext i1 %narrow5 to i64, !dbg !21
55
+ %50 = add nsw i64 %35, %49, !dbg !21
56
+ %51 = extractelement <4 x i1> %38, i64 0, !dbg !21
57
+ %narrow6 = select i1 %41, i1 %51, i1 false, !dbg !21
58
+ %52 = sext i1 %narrow6 to i64, !dbg !21
59
+ %53 = add nsw i64 %37, %52, !dbg !21
60
+ %54 = sext i32 %12 to i64, !dbg !22
61
+ %55 = getelementptr float, ptr addrspace(1) %0, i64 %54, !dbg !22
62
+ %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !23
63
+ %57 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l,b"(ptr addrspace(1) %55, i64 %56, i1 %13) #3, !dbg !23
64
+ %58 = extractvalue { i32, i32, i32, i32 } %57, 0, !dbg !23
65
+ %59 = extractvalue { i32, i32, i32, i32 } %57, 1, !dbg !23
66
+ %60 = extractvalue { i32, i32, i32, i32 } %57, 2, !dbg !23
67
+ %61 = extractvalue { i32, i32, i32, i32 } %57, 3, !dbg !23
68
+ %62 = insertelement <4 x i32> poison, i32 %58, i64 0, !dbg !23
69
+ %63 = insertelement <4 x i32> %62, i32 %59, i64 1, !dbg !23
70
+ %64 = insertelement <4 x i32> %63, i32 %60, i64 2, !dbg !23
71
+ %65 = insertelement <4 x i32> %64, i32 %61, i64 3, !dbg !23
72
+ %66 = bitcast <4 x i32> %65 to <4 x float>, !dbg !23
73
+ %67 = fmul <4 x float> %66, splat (float 0x3FE62E4300000000), !dbg !24
74
+ %68 = icmp slt i64 %2, 2, !dbg !25
75
+ %69 = icmp sgt i64 %2, 1, !dbg !26
76
+ %70 = select i1 %69, i64 %2, i64 0, !dbg !27
77
+ %71 = zext i1 %68 to i64, !dbg !28
78
+ %72 = add i64 %70, %71, !dbg !29
79
+ %73 = mul i64 %44, %72, !dbg !30
80
+ %74 = mul i64 %47, %72, !dbg !30
81
+ %75 = mul i64 %50, %72, !dbg !30
82
+ %76 = mul i64 %53, %72, !dbg !30
83
+ %77 = extractelement <4 x i64> %29, i64 3, !dbg !31
84
+ %78 = getelementptr float, ptr addrspace(1) %1, i64 %77, !dbg !31
85
+ %79 = getelementptr float, ptr addrspace(1) %78, i64 %73, !dbg !31
86
+ %80 = extractelement <4 x i64> %29, i64 2, !dbg !31
87
+ %81 = getelementptr float, ptr addrspace(1) %1, i64 %80, !dbg !31
88
+ %82 = getelementptr float, ptr addrspace(1) %81, i64 %74, !dbg !31
89
+ %83 = extractelement <4 x i64> %29, i64 1, !dbg !31
90
+ %84 = getelementptr float, ptr addrspace(1) %1, i64 %83, !dbg !31
91
+ %85 = getelementptr float, ptr addrspace(1) %84, i64 %75, !dbg !31
92
+ %86 = extractelement <4 x i64> %29, i64 0, !dbg !31
93
+ %87 = getelementptr float, ptr addrspace(1) %1, i64 %86, !dbg !31
94
+ %88 = getelementptr float, ptr addrspace(1) %87, i64 %76, !dbg !31
95
+ %89 = shl nuw nsw i32 %10, 4, !dbg !32
96
+ %90 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %89, !dbg !32
97
+ store <4 x float> %67, ptr addrspace(3) %90, align 16, !dbg !32
98
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32
99
+ %91 = shl nuw nsw i32 %9, 6, !dbg !32
100
+ %92 = and i32 %91, 1536, !dbg !32
101
+ %93 = shl nuw nsw i32 %9, 4, !dbg !32
102
+ %94 = and i32 %93, 112, !dbg !32
103
+ %95 = shl nuw nsw i32 %9, 2, !dbg !32
104
+ %96 = and i32 %95, 384, !dbg !32
105
+ %97 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %92, !dbg !32
106
+ %98 = getelementptr inbounds nuw i8, ptr addrspace(3) %97, i32 %94, !dbg !32
107
+ %99 = getelementptr inbounds nuw i8, ptr addrspace(3) %98, i32 %96, !dbg !32
108
+ %100 = ptrtoint ptr addrspace(3) %99 to i32, !dbg !32
109
+ %101 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %100) #3, !dbg !32
110
+ %102 = extractvalue { i32, i32, i32, i32 } %101, 0, !dbg !32
111
+ %103 = extractvalue { i32, i32, i32, i32 } %101, 1, !dbg !32
112
+ %104 = extractvalue { i32, i32, i32, i32 } %101, 2, !dbg !32
113
+ %105 = extractvalue { i32, i32, i32, i32 } %101, 3, !dbg !32
114
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %102, ptr addrspace(1) %79, i1 %18) #3, !dbg !32
115
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %103, ptr addrspace(1) %82, i1 %19) #3, !dbg !32
116
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %104, ptr addrspace(1) %85, i1 %20) #3, !dbg !32
117
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %105, ptr addrspace(1) %88, i1 %21) #3, !dbg !32
118
+ ret void, !dbg !33
119
+ }
120
+
121
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
122
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
123
+
124
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
125
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
126
+
127
+ ; Function Attrs: convergent nocallback nounwind
128
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
129
+
130
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
131
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
132
+ attributes #2 = { convergent nocallback nounwind }
133
+ attributes #3 = { nounwind }
134
+
135
+ !llvm.dbg.cu = !{!0}
136
+ !llvm.module.flags = !{!2, !3}
137
+
138
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
139
+ !1 = !DIFile(filename: "cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb")
140
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
141
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
142
+ !4 = distinct !DISubprogram(name: "triton_poi_fused_mul_1", linkageName: "triton_poi_fused_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
143
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
144
+ !6 = !{}
145
+ !7 = !DILocation(line: 19, column: 28, scope: !4)
146
+ !8 = !DILocation(line: 19, column: 33, scope: !4)
147
+ !9 = !DILocation(line: 20, column: 36, scope: !4)
148
+ !10 = !DILocation(line: 20, column: 23, scope: !4)
149
+ !11 = !DILocation(line: 21, column: 21, scope: !4)
150
+ !12 = !DILocation(line: 23, column: 19, scope: !4)
151
+ !13 = !DILocation(line: 72, column: 16, scope: !14, inlinedAt: !16)
152
+ !14 = distinct !DILexicalBlockFile(scope: !4, file: !15, discriminator: 0)
153
+ !15 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
154
+ !16 = !DILocation(line: 24, column: 51, scope: !4)
155
+ !17 = !DILocation(line: 74, column: 34, scope: !14, inlinedAt: !16)
156
+ !18 = !DILocation(line: 75, column: 25, scope: !14, inlinedAt: !16)
157
+ !19 = !DILocation(line: 75, column: 36, scope: !14, inlinedAt: !16)
158
+ !20 = !DILocation(line: 75, column: 32, scope: !14, inlinedAt: !16)
159
+ !21 = !DILocation(line: 75, column: 47, scope: !14, inlinedAt: !16)
160
+ !22 = !DILocation(line: 25, column: 30, scope: !4)
161
+ !23 = !DILocation(line: 25, column: 35, scope: !4)
162
+ !24 = !DILocation(line: 27, column: 18, scope: !4)
163
+ !25 = !DILocation(line: 28, column: 49, scope: !4)
164
+ !26 = !DILocation(line: 28, column: 75, scope: !4)
165
+ !27 = !DILocation(line: 28, column: 66, scope: !4)
166
+ !28 = !DILocation(line: 28, scope: !4)
167
+ !29 = !DILocation(line: 28, column: 57, scope: !4)
168
+ !30 = !DILocation(line: 28, column: 34, scope: !4)
169
+ !31 = !DILocation(line: 28, column: 25, scope: !4)
170
+ !32 = !DILocation(line: 28, column: 88, scope: !4)
171
+ !33 = !DILocation(line: 28, column: 4, scope: !4)
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ptx ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_poi_fused_mul_1 // -- Begin function triton_poi_fused_mul_1
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_poi_fused_mul_1
12
+ .visible .entry triton_poi_fused_mul_1(
13
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_1,
15
+ .param .u64 triton_poi_fused_mul_1_param_2,
16
+ .param .u32 triton_poi_fused_mul_1_param_3,
17
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_4,
18
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_5
19
+ )
20
+ .reqntid 128
21
+ {
22
+ .reg .pred %p<23>;
23
+ .reg .b32 %r<56>;
24
+ .reg .b64 %rd<86>;
25
+ .loc 1 18 0 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:18:0
26
+ $L__func_begin0:
27
+ .loc 1 18 0 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:18:0
28
+
29
+ // %bb.0:
30
+ ld.param.b64 %rd27, [triton_poi_fused_mul_1_param_2];
31
+ $L__tmp0:
32
+ .loc 1 19 28 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:19:28
33
+ mov.u32 %r10, %ctaid.x;
34
+ .loc 1 19 33 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:19:33
35
+ shl.b32 %r1, %r10, 9;
36
+ .loc 1 20 36 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:20:36
37
+ mov.u32 %r2, %tid.x;
38
+ and.b32 %r3, %r2, 127;
39
+ .loc 1 20 23 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:20:23
40
+ or.b32 %r5, %r1, %r3;
41
+ or.b32 %r6, %r5, 128;
42
+ or.b32 %r7, %r5, 256;
43
+ .loc 1 23 19 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:23:19
44
+ cvt.s64.s32 %rd3, %r6;
45
+ cvt.s64.s32 %rd4, %r5;
46
+ $L__tmp1:
47
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
48
+ or.b64 %rd28, %rd4, %rd27;
49
+ and.b64 %rd29, %rd28, -4294967296;
50
+ setp.ne.b64 %p1, %rd29, 0;
51
+ @%p1 bra $L__BB0_2;
52
+ bra.uni $L__BB0_1;
53
+ $L__BB0_2:
54
+ div.s64 %rd82, %rd4, %rd27;
55
+ bra.uni $L__BB0_3;
56
+ $L__BB0_1:
57
+ cvt.u32.u64 %r12, %rd27;
58
+ cvt.u32.u64 %r13, %rd4;
59
+ div.u32 %r14, %r13, %r12;
60
+ cvt.u64.u32 %rd82, %r14;
61
+ $L__tmp2:
62
+ $L__BB0_3:
63
+ .loc 1 0 0 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:0
64
+ or.b32 %r8, %r5, 384;
65
+ cvt.s64.s32 %rd2, %r7;
66
+ $L__tmp3:
67
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
68
+ or.b64 %rd30, %rd3, %rd27;
69
+ and.b64 %rd31, %rd30, -4294967296;
70
+ setp.ne.b64 %p2, %rd31, 0;
71
+ @%p2 bra $L__BB0_5;
72
+ bra.uni $L__BB0_4;
73
+ $L__BB0_5:
74
+ div.s64 %rd83, %rd3, %rd27;
75
+ bra.uni $L__BB0_6;
76
+ $L__BB0_4:
77
+ cvt.u32.u64 %r15, %rd27;
78
+ cvt.u32.u64 %r16, %rd3;
79
+ div.u32 %r17, %r16, %r15;
80
+ cvt.u64.u32 %rd83, %r17;
81
+ $L__tmp4:
82
+ $L__BB0_6:
83
+ .loc 1 0 0 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:0
84
+ shl.b32 %r11, %r3, 2;
85
+ cvt.s64.s32 %rd1, %r8;
86
+ $L__tmp5:
87
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
88
+ or.b64 %rd32, %rd2, %rd27;
89
+ and.b64 %rd33, %rd32, -4294967296;
90
+ setp.ne.b64 %p3, %rd33, 0;
91
+ @%p3 bra $L__BB0_8;
92
+ bra.uni $L__BB0_7;
93
+ $L__BB0_8:
94
+ div.s64 %rd84, %rd2, %rd27;
95
+ bra.uni $L__BB0_9;
96
+ $L__BB0_7:
97
+ cvt.u32.u64 %r18, %rd27;
98
+ cvt.u32.u64 %r19, %rd2;
99
+ div.u32 %r20, %r19, %r18;
100
+ cvt.u64.u32 %rd84, %r20;
101
+ $L__tmp6:
102
+ $L__BB0_9:
103
+ .loc 2 0 16 // triton_helpers.py:0:16
104
+ ld.param.b32 %r9, [triton_poi_fused_mul_1_param_3];
105
+ ld.param.b64 %rd26, [triton_poi_fused_mul_1_param_1];
106
+ ld.param.b64 %rd25, [triton_poi_fused_mul_1_param_0];
107
+ or.b32 %r4, %r11, %r1;
108
+ rem.s64 %rd8, %rd4, %rd27;
109
+ rem.s64 %rd7, %rd3, %rd27;
110
+ rem.s64 %rd6, %rd2, %rd27;
111
+ rem.s64 %rd5, %rd1, %rd27;
112
+ $L__tmp7:
113
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
114
+ or.b64 %rd34, %rd1, %rd27;
115
+ and.b64 %rd35, %rd34, -4294967296;
116
+ setp.ne.b64 %p4, %rd35, 0;
117
+ @%p4 bra $L__BB0_11;
118
+ bra.uni $L__BB0_10;
119
+ $L__BB0_11:
120
+ div.s64 %rd85, %rd1, %rd27;
121
+ bra.uni $L__BB0_12;
122
+ $L__BB0_10:
123
+ cvt.u32.u64 %r21, %rd27;
124
+ cvt.u32.u64 %r22, %rd1;
125
+ div.u32 %r23, %r22, %r21;
126
+ cvt.u64.u32 %rd85, %r23;
127
+ $L__tmp8:
128
+ $L__BB0_12:
129
+ .loc 1 21 21 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:21:21
130
+ setp.lt.s32 %p9, %r8, %r9;
131
+ setp.lt.s32 %p8, %r7, %r9;
132
+ setp.lt.s32 %p7, %r6, %r9;
133
+ setp.lt.s32 %p6, %r5, %r9;
134
+ setp.lt.s32 %p5, %r4, %r9;
135
+ $L__tmp9:
136
+ .loc 2 74 34 // triton_helpers.py:74:34 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
137
+ setp.ne.b64 %p10, %rd5, 0;
138
+ setp.ne.b64 %p11, %rd6, 0;
139
+ setp.ne.b64 %p12, %rd7, 0;
140
+ setp.ne.b64 %p13, %rd8, 0;
141
+ .loc 2 75 25 // triton_helpers.py:75:25 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
142
+ setp.lt.s32 %p14, %r1, 0;
143
+ .loc 2 75 36 // triton_helpers.py:75:36 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
144
+ setp.lt.s64 %p15, %rd27, 0;
145
+ .loc 2 75 32 // triton_helpers.py:75:32 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
146
+ xor.pred %p16, %p14, %p15;
147
+ .loc 2 75 47 // triton_helpers.py:75:47 @[ cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:24:51 ]
148
+ and.pred %p17, %p16, %p13;
149
+ selp.b64 %rd47, -1, 0, %p17;
150
+ add.s64 %rd48, %rd82, %rd47;
151
+ and.pred %p18, %p16, %p12;
152
+ selp.b64 %rd49, -1, 0, %p18;
153
+ add.s64 %rd50, %rd83, %rd49;
154
+ and.pred %p19, %p16, %p11;
155
+ selp.b64 %rd51, -1, 0, %p19;
156
+ add.s64 %rd52, %rd84, %rd51;
157
+ and.pred %p20, %p16, %p10;
158
+ selp.b64 %rd53, -1, 0, %p20;
159
+ add.s64 %rd54, %rd85, %rd53;
160
+ $L__tmp10:
161
+ .loc 1 25 30 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:25:30
162
+ mad.wide.s32 %rd37, %r4, 4, %rd25;
163
+ .loc 1 25 35 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:25:35
164
+ // begin inline asm
165
+ mov.u64 %rd38, 0x0;
166
+ createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0;
167
+ // end inline asm
168
+ // begin inline asm
169
+ mov.u32 %r24, 0x0;
170
+ mov.u32 %r25, 0x0;
171
+ mov.u32 %r26, 0x0;
172
+ mov.u32 %r27, 0x0;
173
+ @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r24, %r25, %r26, %r27 }, [ %rd37 + 0 ], %rd38;
174
+ // end inline asm
175
+ cvt.u64.u32 %rd55, %r24;
176
+ cvt.u64.u32 %rd56, %r25;
177
+ shl.b64 %rd57, %rd56, 32;
178
+ or.b64 %rd58, %rd55, %rd57;
179
+ cvt.u64.u32 %rd59, %r26;
180
+ cvt.u64.u32 %rd60, %r27;
181
+ shl.b64 %rd61, %rd60, 32;
182
+ or.b64 %rd62, %rd59, %rd61;
183
+ .loc 1 27 18 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:27:18
184
+ mov.b64 {%r37, %r38}, %rd62;
185
+ mul.f32 %r39, %r38, 0f3F317218;
186
+ mul.f32 %r40, %r37, 0f3F317218;
187
+ mov.b64 {%r41, %r42}, %rd58;
188
+ mul.f32 %r43, %r42, 0f3F317218;
189
+ mul.f32 %r44, %r41, 0f3F317218;
190
+ .loc 1 28 49 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:49
191
+ setp.lt.s64 %p21, %rd27, 2;
192
+ .loc 1 28 75 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:75
193
+ setp.gt.s64 %p22, %rd27, 1;
194
+ .loc 1 28 66 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:66
195
+ selp.b64 %rd63, %rd27, 0, %p22;
196
+ .loc 1 28 0 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28
197
+ selp.b64 %rd64, 1, 0, %p21;
198
+ .loc 1 28 57 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:57
199
+ add.s64 %rd65, %rd63, %rd64;
200
+ .loc 1 28 34 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:34
201
+ mul.lo.s64 %rd66, %rd48, %rd65;
202
+ mul.lo.s64 %rd67, %rd50, %rd65;
203
+ mul.lo.s64 %rd68, %rd52, %rd65;
204
+ mul.lo.s64 %rd69, %rd54, %rd65;
205
+ .loc 1 28 25 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:25
206
+ shl.b64 %rd70, %rd8, 2;
207
+ add.s64 %rd71, %rd26, %rd70;
208
+ shl.b64 %rd72, %rd66, 2;
209
+ add.s64 %rd39, %rd71, %rd72;
210
+ shl.b64 %rd73, %rd7, 2;
211
+ add.s64 %rd74, %rd26, %rd73;
212
+ shl.b64 %rd75, %rd67, 2;
213
+ add.s64 %rd40, %rd74, %rd75;
214
+ shl.b64 %rd76, %rd6, 2;
215
+ add.s64 %rd77, %rd26, %rd76;
216
+ shl.b64 %rd78, %rd68, 2;
217
+ add.s64 %rd41, %rd77, %rd78;
218
+ shl.b64 %rd79, %rd5, 2;
219
+ add.s64 %rd80, %rd26, %rd79;
220
+ shl.b64 %rd81, %rd69, 2;
221
+ add.s64 %rd42, %rd80, %rd81;
222
+ .loc 1 28 88 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:88
223
+ shl.b32 %r45, %r3, 4;
224
+ mov.b32 %r46, global_smem;
225
+ add.s32 %r47, %r46, %r45;
226
+ st.shared.v4.b32 [%r47], {%r44, %r43, %r40, %r39};
227
+ bar.sync 0;
228
+ shl.b32 %r48, %r2, 6;
229
+ and.b32 %r49, %r48, 1536;
230
+ shl.b32 %r50, %r2, 4;
231
+ and.b32 %r51, %r50, 112;
232
+ shl.b32 %r52, %r2, 2;
233
+ and.b32 %r53, %r52, 384;
234
+ add.s32 %r54, %r46, %r49;
235
+ add.s32 %r55, %r54, %r51;
236
+ add.s32 %r32, %r55, %r53;
237
+ // begin inline asm
238
+ ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r33, %r34, %r35, %r36}, [%r32];
239
+ // end inline asm
240
+ // begin inline asm
241
+ @%p6 st.global.b32 [ %rd39 + 0 ], { %r33 };
242
+ // end inline asm
243
+ // begin inline asm
244
+ @%p7 st.global.b32 [ %rd40 + 0 ], { %r34 };
245
+ // end inline asm
246
+ // begin inline asm
247
+ @%p8 st.global.b32 [ %rd41 + 0 ], { %r35 };
248
+ // end inline asm
249
+ // begin inline asm
250
+ @%p9 st.global.b32 [ %rd42 + 0 ], { %r36 };
251
+ // end inline asm
252
+ .loc 1 28 4 // cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py:28:4
253
+ ret;
254
+ $L__tmp11:
255
+ $L__func_end0:
256
+ // -- End function
257
+ }
258
+ .file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py"
259
+ .file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
260
+ .section .debug_abbrev
261
+ {
262
+ .b8 1 // Abbreviation Code
263
+ .b8 17 // DW_TAG_compile_unit
264
+ .b8 1 // DW_CHILDREN_yes
265
+ .b8 37 // DW_AT_producer
266
+ .b8 8 // DW_FORM_string
267
+ .b8 19 // DW_AT_language
268
+ .b8 5 // DW_FORM_data2
269
+ .b8 3 // DW_AT_name
270
+ .b8 8 // DW_FORM_string
271
+ .b8 16 // DW_AT_stmt_list
272
+ .b8 6 // DW_FORM_data4
273
+ .b8 27 // DW_AT_comp_dir
274
+ .b8 8 // DW_FORM_string
275
+ .b8 0 // EOM(1)
276
+ .b8 0 // EOM(2)
277
+ .b8 2 // Abbreviation Code
278
+ .b8 46 // DW_TAG_subprogram
279
+ .b8 0 // DW_CHILDREN_no
280
+ .b8 3 // DW_AT_name
281
+ .b8 8 // DW_FORM_string
282
+ .b8 32 // DW_AT_inline
283
+ .b8 11 // DW_FORM_data1
284
+ .b8 0 // EOM(1)
285
+ .b8 0 // EOM(2)
286
+ .b8 3 // Abbreviation Code
287
+ .b8 46 // DW_TAG_subprogram
288
+ .b8 1 // DW_CHILDREN_yes
289
+ .b8 17 // DW_AT_low_pc
290
+ .b8 1 // DW_FORM_addr
291
+ .b8 18 // DW_AT_high_pc
292
+ .b8 1 // DW_FORM_addr
293
+ .b8 49 // DW_AT_abstract_origin
294
+ .b8 19 // DW_FORM_ref4
295
+ .b8 0 // EOM(1)
296
+ .b8 0 // EOM(2)
297
+ .b8 4 // Abbreviation Code
298
+ .b8 29 // DW_TAG_inlined_subroutine
299
+ .b8 0 // DW_CHILDREN_no
300
+ .b8 49 // DW_AT_abstract_origin
301
+ .b8 19 // DW_FORM_ref4
302
+ .b8 17 // DW_AT_low_pc
303
+ .b8 1 // DW_FORM_addr
304
+ .b8 18 // DW_AT_high_pc
305
+ .b8 1 // DW_FORM_addr
306
+ .b8 88 // DW_AT_call_file
307
+ .b8 11 // DW_FORM_data1
308
+ .b8 89 // DW_AT_call_line
309
+ .b8 11 // DW_FORM_data1
310
+ .b8 87 // DW_AT_call_column
311
+ .b8 11 // DW_FORM_data1
312
+ .b8 0 // EOM(1)
313
+ .b8 0 // EOM(2)
314
+ .b8 0 // EOM(3)
315
+ }
316
+ .section .debug_info
317
+ {
318
+ .b32 211 // Length of Unit
319
+ .b8 2 // DWARF version number
320
+ .b8 0
321
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
322
+ .b8 8 // Address Size (in bytes)
323
+ .b8 1 // Abbrev [1] 0xb:0xcc DW_TAG_compile_unit
324
+ .b8 116 // DW_AT_producer
325
+ .b8 114
326
+ .b8 105
327
+ .b8 116
328
+ .b8 111
329
+ .b8 110
330
+ .b8 0
331
+ .b8 2 // DW_AT_language
332
+ .b8 0
333
+ .b8 99 // DW_AT_name
334
+ .b8 121
335
+ .b8 98
336
+ .b8 117
337
+ .b8 105
338
+ .b8 102
339
+ .b8 107
340
+ .b8 98
341
+ .b8 109
342
+ .b8 101
343
+ .b8 53
344
+ .b8 112
345
+ .b8 113
346
+ .b8 101
347
+ .b8 99
348
+ .b8 55
349
+ .b8 104
350
+ .b8 113
351
+ .b8 112
352
+ .b8 108
353
+ .b8 122
354
+ .b8 103
355
+ .b8 111
356
+ .b8 115
357
+ .b8 106
358
+ .b8 119
359
+ .b8 116
360
+ .b8 109
361
+ .b8 112
362
+ .b8 120
363
+ .b8 98
364
+ .b8 111
365
+ .b8 51
366
+ .b8 117
367
+ .b8 98
368
+ .b8 104
369
+ .b8 118
370
+ .b8 97
371
+ .b8 111
372
+ .b8 98
373
+ .b8 99
374
+ .b8 104
375
+ .b8 103
376
+ .b8 114
377
+ .b8 122
378
+ .b8 116
379
+ .b8 117
380
+ .b8 112
381
+ .b8 121
382
+ .b8 104
383
+ .b8 107
384
+ .b8 100
385
+ .b8 46
386
+ .b8 112
387
+ .b8 121
388
+ .b8 0
389
+ .b32 .debug_line // DW_AT_stmt_list
390
+ .b8 47 // DW_AT_comp_dir
391
+ .b8 119
392
+ .b8 111
393
+ .b8 114
394
+ .b8 107
395
+ .b8 115
396
+ .b8 112
397
+ .b8 97
398
+ .b8 99
399
+ .b8 101
400
+ .b8 47
401
+ .b8 104
402
+ .b8 97
403
+ .b8 110
404
+ .b8 114
405
+ .b8 117
406
+ .b8 105
407
+ .b8 47
408
+ .b8 106
409
+ .b8 117
410
+ .b8 110
411
+ .b8 113
412
+ .b8 117
413
+ .b8 97
414
+ .b8 110
415
+ .b8 47
416
+ .b8 83
417
+ .b8 112
418
+ .b8 101
419
+ .b8 99
420
+ .b8 70
421
+ .b8 111
422
+ .b8 114
423
+ .b8 103
424
+ .b8 101
425
+ .b8 47
426
+ .b8 99
427
+ .b8 97
428
+ .b8 99
429
+ .b8 104
430
+ .b8 101
431
+ .b8 47
432
+ .b8 99
433
+ .b8 111
434
+ .b8 109
435
+ .b8 112
436
+ .b8 105
437
+ .b8 108
438
+ .b8 101
439
+ .b8 100
440
+ .b8 95
441
+ .b8 107
442
+ .b8 101
443
+ .b8 114
444
+ .b8 110
445
+ .b8 101
446
+ .b8 108
447
+ .b8 115
448
+ .b8 47
449
+ .b8 121
450
+ .b8 98
451
+ .b8 0
452
+ .b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
453
+ .b8 116 // DW_AT_name
454
+ .b8 114
455
+ .b8 105
456
+ .b8 116
457
+ .b8 111
458
+ .b8 110
459
+ .b8 95
460
+ .b8 112
461
+ .b8 111
462
+ .b8 105
463
+ .b8 95
464
+ .b8 102
465
+ .b8 117
466
+ .b8 115
467
+ .b8 101
468
+ .b8 100
469
+ .b8 95
470
+ .b8 109
471
+ .b8 117
472
+ .b8 108
473
+ .b8 95
474
+ .b8 49
475
+ .b8 0
476
+ .b8 1 // DW_AT_inline
477
+ .b8 3 // Abbrev [3] 0xa8:0x2e DW_TAG_subprogram
478
+ .b64 $L__func_begin0 // DW_AT_low_pc
479
+ .b64 $L__func_end0 // DW_AT_high_pc
480
+ .b32 143 // DW_AT_abstract_origin
481
+ .b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
482
+ .b32 143 // DW_AT_abstract_origin
483
+ .b64 $L__tmp1 // DW_AT_low_pc
484
+ .b64 $L__tmp10 // DW_AT_high_pc
485
+ .b8 1 // DW_AT_call_file
486
+ .b8 24 // DW_AT_call_line
487
+ .b8 51 // DW_AT_call_column
488
+ .b8 0 // End Of Children Mark
489
+ .b8 0 // End Of Children Mark
490
+ }
491
+ .section .debug_macinfo { }
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.source ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":18:0)
2
+ #loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
3
+ #loc34 = loc("in_ptr0"(#loc))
4
+ #loc35 = loc("out_ptr0"(#loc))
5
+ #loc36 = loc("ks0"(#loc))
6
+ #loc37 = loc("xnumel"(#loc))
7
+ #loc49 = loc("a"(#loc22))
8
+ #loc50 = loc("b"(#loc22))
9
+ module {
10
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
11
+ %xoffset = tt.get_program_id x : i32 loc(#loc38)
12
+ %xoffset_0 = arith.constant 512 : i32 loc(#loc39)
13
+ %xoffset_1 = arith.constant 512 : i32 loc(#loc39)
14
+ %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc39)
15
+ %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc40)
16
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc41)
17
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc41)
18
+ %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc42)
19
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc42)
20
+ %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc43)
21
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc43)
22
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<512xi64> loc(#loc43)
23
+ %x1 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S512S_i64__(%xindex_4, %ks0) : (tensor<512xi32>, i64) -> tensor<512xi64> loc(#loc44)
24
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc45)
25
+ %tmp0_8 = tt.addptr %tmp0, %xindex_4 : tensor<512x!tt.ptr<f32>>, tensor<512xi32> loc(#loc45)
26
+ %tmp0_9 = tt.load %tmp0_8, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>> loc(#loc46)
27
+ %tmp1 = arith.constant 0.693147182 : f32 loc(#loc47)
28
+ %tmp2 = arith.constant dense<0.693147182> : tensor<512xf32> loc(#loc48)
29
+ %tmp2_10 = arith.mulf %tmp0_9, %tmp2 : tensor<512xf32> loc(#loc48)
30
+ %c1_i32 = arith.constant 1 : i32 loc(#loc12)
31
+ %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc12)
32
+ %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc12)
33
+ %c1_i32_11 = arith.constant 1 : i32 loc(#loc13)
34
+ %c1_i32_12 = arith.constant 1 : i32 loc(#loc13)
35
+ %2 = arith.extui %1 : i1 to i32 loc(#loc13)
36
+ %3 = arith.muli %c1_i32_12, %2 : i32 loc(#loc13)
37
+ %c1_i32_13 = arith.constant 1 : i32 loc(#loc14)
38
+ %4 = arith.extsi %c1_i32_13 : i32 to i64 loc(#loc14)
39
+ %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc14)
40
+ %6 = arith.extui %5 : i1 to i64 loc(#loc15)
41
+ %7 = arith.muli %ks0, %6 : i64 loc(#loc15)
42
+ %8 = arith.extsi %3 : i32 to i64 loc(#loc16)
43
+ %9 = arith.addi %8, %7 : i64 loc(#loc16)
44
+ %10 = tt.splat %9 : i64 -> tensor<512xi64> loc(#loc17)
45
+ %11 = arith.muli %x1, %10 : tensor<512xi64> loc(#loc17)
46
+ %12 = arith.addi %x0_7, %11 : tensor<512xi64> loc(#loc18)
47
+ %13 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc19)
48
+ %14 = tt.addptr %13, %12 : tensor<512x!tt.ptr<f32>>, tensor<512xi64> loc(#loc19)
49
+ tt.store %14, %tmp2_10, %xmask_5 : tensor<512x!tt.ptr<f32>> loc(#loc20)
50
+ tt.return loc(#loc21)
51
+ } loc(#loc)
52
+ tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S512S_i64__(%a: tensor<512xi32> loc("a"(#loc22)), %b: i64 loc("b"(#loc22))) -> tensor<512xi64> attributes {noinline = false} {
53
+ %quot = arith.extsi %a : tensor<512xi32> to tensor<512xi64> loc(#loc51)
54
+ %quot_0 = tt.splat %b : i64 -> tensor<512xi64> loc(#loc51)
55
+ %quot_1 = arith.divsi %quot, %quot_0 : tensor<512xi64> loc(#loc51)
56
+ %remainder = arith.extsi %a : tensor<512xi32> to tensor<512xi64> loc(#loc52)
57
+ %remainder_2 = tt.splat %b : i64 -> tensor<512xi64> loc(#loc52)
58
+ %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<512xi64> loc(#loc52)
59
+ %fixed = arith.constant 0 : i32 loc(#loc53)
60
+ %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc53)
61
+ %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<512xi64> loc(#loc53)
62
+ %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<512xi64> loc(#loc53)
63
+ %fixed_7 = arith.constant 1 : i32 loc(#loc54)
64
+ %fixed_8 = arith.constant 1 : i64 loc(#loc54)
65
+ %fixed_9 = arith.constant dense<1> : tensor<512xi64> loc(#loc54)
66
+ %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<512xi64> loc(#loc54)
67
+ %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<512xi1>, tensor<512xi64> loc(#loc55)
68
+ %c0_i32 = arith.constant 0 : i32 loc(#loc28)
69
+ %cst = arith.constant dense<0> : tensor<512xi32> loc(#loc28)
70
+ %0 = arith.cmpi slt, %a, %cst : tensor<512xi32> loc(#loc28)
71
+ %c0_i32_12 = arith.constant 0 : i32 loc(#loc29)
72
+ %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc29)
73
+ %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc29)
74
+ %3 = tt.splat %2 : i1 -> tensor<512xi1> loc(#loc30)
75
+ %4 = arith.cmpi ne, %0, %3 : tensor<512xi1> loc(#loc30)
76
+ %5 = arith.select %4, %fixed_11, %quot_1 : tensor<512xi1>, tensor<512xi64> loc(#loc31)
77
+ tt.return %5 : tensor<512xi64> loc(#loc32)
78
+ ^bb1: // no predecessors
79
+ %6 = ub.poison : tensor<512xi64> loc(#loc33)
80
+ tt.return %6 : tensor<512xi64> loc(#loc33)
81
+ } loc(#loc22)
82
+ } loc(#loc)
83
+ #loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":19:28)
84
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":19:33)
85
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":20:36)
86
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":20:23)
87
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":21:21)
88
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":23:19)
89
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":24:51)
90
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":25:30)
91
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":25:35)
92
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":26:11)
93
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":27:18)
94
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:49)
95
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:41)
96
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:75)
97
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:66)
98
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:57)
99
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:34)
100
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:30)
101
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:25)
102
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:88)
103
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:4)
104
+ #loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
105
+ #loc24 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
106
+ #loc25 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
107
+ #loc26 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
108
+ #loc27 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
109
+ #loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
110
+ #loc29 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
111
+ #loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
112
+ #loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
113
+ #loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
114
+ #loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
115
+ #loc38 = loc("xoffset"(#loc1))
116
+ #loc39 = loc("xoffset"(#loc2))
117
+ #loc40 = loc("xindex"(#loc3))
118
+ #loc41 = loc("xindex"(#loc4))
119
+ #loc42 = loc("xmask"(#loc5))
120
+ #loc43 = loc("x0"(#loc6))
121
+ #loc44 = loc("x1"(#loc7))
122
+ #loc45 = loc("tmp0"(#loc8))
123
+ #loc46 = loc("tmp0"(#loc9))
124
+ #loc47 = loc("tmp1"(#loc10))
125
+ #loc48 = loc("tmp2"(#loc11))
126
+ #loc51 = loc("quot"(#loc23))
127
+ #loc52 = loc("remainder"(#loc24))
128
+ #loc53 = loc("fixed"(#loc25))
129
+ #loc54 = loc("fixed"(#loc26))
130
+ #loc55 = loc("fixed"(#loc27))
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ttgir ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
3
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":18:0)
4
+ #loc30 = loc("in_ptr0"(#loc))
5
+ #loc31 = loc("out_ptr0"(#loc))
6
+ #loc32 = loc("ks0"(#loc))
7
+ #loc33 = loc("xnumel"(#loc))
8
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
9
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
10
+ %cst = arith.constant dense<0.693147182> : tensor<512xf32, #blocked> loc(#loc1)
11
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
12
+ %c0_i64 = arith.constant 0 : i64 loc(#loc1)
13
+ %cst_0 = arith.constant dense<0> : tensor<512xi64, #blocked1> loc(#loc1)
14
+ %cst_1 = arith.constant dense<0> : tensor<512xi32, #blocked1> loc(#loc1)
15
+ %cst_2 = arith.constant dense<1> : tensor<512xi64, #blocked1> loc(#loc1)
16
+ %c512_i32 = arith.constant 512 : i32 loc(#loc1)
17
+ %xoffset = tt.get_program_id x : i32 loc(#loc34)
18
+ %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc35)
19
+ %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc36)
20
+ %xindex_4 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked1> loc(#loc36)
21
+ %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc37)
22
+ %xindex_6 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked1> loc(#loc37)
23
+ %xindex_7 = arith.addi %xindex_5, %xindex : tensor<512xi32, #blocked> loc(#loc37)
24
+ %xindex_8 = arith.addi %xindex_6, %xindex_4 : tensor<512xi32, #blocked1> loc(#loc37)
25
+ %xmask = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked> loc(#loc38)
26
+ %xmask_9 = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked1> loc(#loc38)
27
+ %xmask_10 = arith.cmpi slt, %xindex_7, %xmask : tensor<512xi32, #blocked> loc(#loc38)
28
+ %xmask_11 = arith.cmpi slt, %xindex_8, %xmask_9 : tensor<512xi32, #blocked1> loc(#loc38)
29
+ %x0 = arith.extsi %xindex_8 : tensor<512xi32, #blocked1> to tensor<512xi64, #blocked1> loc(#loc39)
30
+ %x0_12 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked1> loc(#loc39)
31
+ %x0_13 = arith.remsi %x0, %x0_12 : tensor<512xi64, #blocked1> loc(#loc39)
32
+ %quot = arith.divsi %x0, %x0_12 : tensor<512xi64, #blocked1> loc(#loc49)
33
+ %fixed = arith.cmpi ne, %x0_13, %cst_0 : tensor<512xi64, #blocked1> loc(#loc50)
34
+ %fixed_14 = arith.subi %quot, %cst_2 : tensor<512xi64, #blocked1> loc(#loc51)
35
+ %fixed_15 = arith.select %fixed, %fixed_14, %quot : tensor<512xi1, #blocked1>, tensor<512xi64, #blocked1> loc(#loc52)
36
+ %x1 = arith.cmpi slt, %xindex_8, %cst_1 : tensor<512xi32, #blocked1> loc(#loc53)
37
+ %x1_16 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc54)
38
+ %x1_17 = tt.splat %x1_16 : i1 -> tensor<512xi1, #blocked1> loc(#loc55)
39
+ %x1_18 = arith.cmpi ne, %x1, %x1_17 : tensor<512xi1, #blocked1> loc(#loc55)
40
+ %x1_19 = arith.select %x1_18, %fixed_15, %quot : tensor<512xi1, #blocked1>, tensor<512xi64, #blocked1> loc(#loc56)
41
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #blocked> loc(#loc45)
42
+ %tmp0_20 = tt.addptr %tmp0, %xindex_7 : tensor<512x!tt.ptr<f32>, #blocked>, tensor<512xi32, #blocked> loc(#loc45)
43
+ %tmp0_21 = tt.load %tmp0_20, %xmask_10 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>, #blocked> loc(#loc46)
44
+ %tmp2 = arith.mulf %tmp0_21, %cst : tensor<512xf32, #blocked> loc(#loc47)
45
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
46
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
47
+ %2 = arith.extui %1 : i1 to i64 loc(#loc22)
48
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
49
+ %4 = arith.extui %0 : i1 to i64 loc(#loc48)
50
+ %5 = arith.addi %4, %3 : i64 loc(#loc23)
51
+ %6 = tt.splat %5 : i64 -> tensor<512xi64, #blocked1> loc(#loc25)
52
+ %7 = arith.muli %x1_19, %6 : tensor<512xi64, #blocked1> loc(#loc25)
53
+ %8 = arith.addi %x0_13, %7 : tensor<512xi64, #blocked1> loc(#loc26)
54
+ %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #blocked1> loc(#loc27)
55
+ %10 = tt.addptr %9, %8 : tensor<512x!tt.ptr<f32>, #blocked1>, tensor<512xi64, #blocked1> loc(#loc27)
56
+ %11 = ttg.convert_layout %tmp2 : tensor<512xf32, #blocked> -> tensor<512xf32, #blocked1> loc(#loc28)
57
+ tt.store %10, %11, %xmask_11 : tensor<512x!tt.ptr<f32>, #blocked1> loc(#loc28)
58
+ tt.return loc(#loc29)
59
+ } loc(#loc)
60
+ } loc(#loc)
61
+ #loc1 = loc(unknown)
62
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":19:28)
63
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":19:33)
64
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":20:36)
65
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":20:23)
66
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":21:21)
67
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":23:19)
68
+ #loc8 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
69
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":24:51)
70
+ #loc10 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
71
+ #loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
72
+ #loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
73
+ #loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
74
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
75
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
76
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
77
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":25:30)
78
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":25:35)
79
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":27:18)
80
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:49)
81
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:75)
82
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:66)
83
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:57)
84
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:41)
85
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:34)
86
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:30)
87
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:25)
88
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:88)
89
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:4)
90
+ #loc34 = loc("xoffset"(#loc2))
91
+ #loc35 = loc("xoffset"(#loc3))
92
+ #loc36 = loc("xindex"(#loc4))
93
+ #loc37 = loc("xindex"(#loc5))
94
+ #loc38 = loc("xmask"(#loc6))
95
+ #loc39 = loc("x0"(#loc7))
96
+ #loc40 = loc("quot"(#loc8))
97
+ #loc41 = loc("x1"(#loc9))
98
+ #loc42 = loc("fixed"(#loc10))
99
+ #loc43 = loc("fixed"(#loc11))
100
+ #loc44 = loc("fixed"(#loc12))
101
+ #loc45 = loc("tmp0"(#loc17))
102
+ #loc46 = loc("tmp0"(#loc18))
103
+ #loc47 = loc("tmp2"(#loc19))
104
+ #loc48 = loc(fused[#loc23, #loc24])
105
+ #loc49 = loc(callsite(#loc40 at #loc41))
106
+ #loc50 = loc(callsite(#loc42 at #loc41))
107
+ #loc51 = loc(callsite(#loc43 at #loc41))
108
+ #loc52 = loc(callsite(#loc44 at #loc41))
109
+ #loc53 = loc(callsite(#loc13 at #loc41))
110
+ #loc54 = loc(callsite(#loc14 at #loc41))
111
+ #loc55 = loc(callsite(#loc15 at #loc41))
112
+ #loc56 = loc(callsite(#loc16 at #loc41))
progress/github/SpecForge/cache/compiled_kernels/triton/3/5V7HRCMCWIP4YV6CEPBLBEAZBTHHDVZJ6PH5PEGDK4M2L674ASDQ/triton_poi_fused_mul_1.ttir ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":18:0)
2
+ #loc30 = loc("in_ptr0"(#loc))
3
+ #loc31 = loc("out_ptr0"(#loc))
4
+ #loc32 = loc("ks0"(#loc))
5
+ #loc33 = loc("xnumel"(#loc))
6
+ module {
7
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
8
+ %fixed = arith.constant dense<1> : tensor<512xi64> loc(#loc49)
9
+ %x1 = arith.constant dense<0> : tensor<512xi32> loc(#loc50)
10
+ %fixed_0 = arith.constant dense<0> : tensor<512xi64> loc(#loc51)
11
+ %x1_1 = arith.constant 0 : i64 loc(#loc52)
12
+ %c1_i64 = arith.constant 1 : i64 loc(#loc6)
13
+ %tmp2 = arith.constant dense<0.693147182> : tensor<512xf32> loc(#loc37)
14
+ %c512_i32 = arith.constant 512 : i32 loc(#loc6)
15
+ %xoffset = tt.get_program_id x : i32 loc(#loc38)
16
+ %xoffset_2 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc39)
17
+ %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc40)
18
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc41)
19
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc41)
20
+ %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc42)
21
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc42)
22
+ %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc43)
23
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc43)
24
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<512xi64> loc(#loc43)
25
+ %quot = arith.divsi %x0, %x0_6 : tensor<512xi64> loc(#loc53)
26
+ %fixed_8 = arith.cmpi ne, %x0_7, %fixed_0 : tensor<512xi64> loc(#loc51)
27
+ %fixed_9 = arith.subi %quot, %fixed : tensor<512xi64> loc(#loc49)
28
+ %fixed_10 = arith.select %fixed_8, %fixed_9, %quot : tensor<512xi1>, tensor<512xi64> loc(#loc54)
29
+ %x1_11 = arith.cmpi slt, %xindex_4, %x1 : tensor<512xi32> loc(#loc50)
30
+ %x1_12 = arith.cmpi slt, %ks0, %x1_1 : i64 loc(#loc52)
31
+ %x1_13 = tt.splat %x1_12 : i1 -> tensor<512xi1> loc(#loc55)
32
+ %x1_14 = arith.cmpi ne, %x1_11, %x1_13 : tensor<512xi1> loc(#loc55)
33
+ %x1_15 = arith.select %x1_14, %fixed_10, %quot : tensor<512xi1>, tensor<512xi64> loc(#loc56)
34
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc46)
35
+ %tmp0_16 = tt.addptr %tmp0, %xindex_4 : tensor<512x!tt.ptr<f32>>, tensor<512xi32> loc(#loc46)
36
+ %tmp0_17 = tt.load %tmp0_16, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>> loc(#loc47)
37
+ %tmp2_18 = arith.mulf %tmp0_17, %tmp2 : tensor<512xf32> loc(#loc37)
38
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
39
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
40
+ %2 = arith.extui %1 : i1 to i64 loc(#loc22)
41
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
42
+ %4 = arith.extui %0 : i1 to i64 loc(#loc48)
43
+ %5 = arith.addi %4, %3 : i64 loc(#loc23)
44
+ %6 = tt.splat %5 : i64 -> tensor<512xi64> loc(#loc25)
45
+ %7 = arith.muli %x1_15, %6 : tensor<512xi64> loc(#loc25)
46
+ %8 = arith.addi %x0_7, %7 : tensor<512xi64> loc(#loc26)
47
+ %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc27)
48
+ %10 = tt.addptr %9, %8 : tensor<512x!tt.ptr<f32>>, tensor<512xi64> loc(#loc27)
49
+ tt.store %10, %tmp2_18, %xmask_5 : tensor<512x!tt.ptr<f32>> loc(#loc28)
50
+ tt.return loc(#loc29)
51
+ } loc(#loc)
52
+ } loc(#loc)
53
+ #loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
54
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":24:51)
55
+ #loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
56
+ #loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
57
+ #loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
58
+ #loc6 = loc(unknown)
59
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":27:18)
60
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":19:28)
61
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":19:33)
62
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":20:36)
63
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":20:23)
64
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":21:21)
65
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":23:19)
66
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
67
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
68
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
69
+ #loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
70
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":25:30)
71
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":25:35)
72
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:49)
73
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:75)
74
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:66)
75
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:57)
76
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:41)
77
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:34)
78
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:30)
79
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:25)
80
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:88)
81
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/yb/cybuifkbme5pqec7hqplzgosjwtmpxbo3ubhvaobchgrztupyhkd.py":28:4)
82
+ #loc34 = loc("fixed"(#loc1))
83
+ #loc35 = loc("x1"(#loc2))
84
+ #loc36 = loc("fixed"(#loc4))
85
+ #loc37 = loc("tmp2"(#loc7))
86
+ #loc38 = loc("xoffset"(#loc8))
87
+ #loc39 = loc("xoffset"(#loc9))
88
+ #loc40 = loc("xindex"(#loc10))
89
+ #loc41 = loc("xindex"(#loc11))
90
+ #loc42 = loc("xmask"(#loc12))
91
+ #loc43 = loc("x0"(#loc13))
92
+ #loc44 = loc("quot"(#loc14))
93
+ #loc45 = loc("fixed"(#loc15))
94
+ #loc46 = loc("tmp0"(#loc18))
95
+ #loc47 = loc("tmp0"(#loc19))
96
+ #loc48 = loc(fused[#loc23, #loc24])
97
+ #loc49 = loc(callsite(#loc34 at #loc35))
98
+ #loc50 = loc(callsite(#loc3 at #loc35))
99
+ #loc51 = loc(callsite(#loc36 at #loc35))
100
+ #loc52 = loc(callsite(#loc5 at #loc35))
101
+ #loc53 = loc(callsite(#loc44 at #loc35))
102
+ #loc54 = loc(callsite(#loc45 at #loc35))
103
+ #loc55 = loc(callsite(#loc16 at #loc35))
104
+ #loc56 = loc(callsite(#loc17 at #loc35))
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/__grp__triton_red_fused_mul_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_red_fused_mul_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.source", "triton_red_fused_mul_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttir", "triton_red_fused_mul_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttgir", "triton_red_fused_mul_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.llir", "triton_red_fused_mul_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ptx", "triton_red_fused_mul_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.cubin", "triton_red_fused_mul_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.json"}}
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.cubin ADDED
Binary file (16.3 kB). View file
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "f923b0567974fe2b90222adc62f5264539f57bab3a964b0b1b908af49d3e34ed", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_mul_0"}
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.llir ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_red_fused_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 {
9
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
10
+ %11 = shl i32 %10, 2, !dbg !8
11
+ %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
12
+ %13 = and i32 %12, 96, !dbg !9
13
+ %14 = lshr exact i32 %13, 5, !dbg !9
14
+ %15 = and i32 %12, 3, !dbg !9
15
+ %16 = or disjoint i32 %14, %11, !dbg !10
16
+ %17 = or disjoint i32 %11, %15, !dbg !10
17
+ %18 = icmp slt i32 %16, %5, !dbg !11
18
+ %19 = icmp slt i32 %17, %5, !dbg !11
19
+ %20 = shl nuw nsw i32 %12, 2, !dbg !12
20
+ %21 = and i32 %20, 124, !dbg !12
21
+ %22 = sext i32 %16 to i64, !dbg !13
22
+ %23 = sext i32 %17 to i64, !dbg !13
23
+ %.frozen = freeze i64 %4, !dbg !14
24
+ %24 = sdiv i64 %22, %.frozen, !dbg !14
25
+ %25 = mul i64 %24, %.frozen, !dbg !13
26
+ %.decomposed = sub i64 %22, %25, !dbg !13
27
+ %.frozen10 = freeze i64 %4, !dbg !14
28
+ %26 = sdiv i64 %23, %.frozen10, !dbg !14
29
+ %27 = mul i64 %26, %.frozen10, !dbg !13
30
+ %.decomposed11 = sub i64 %23, %27, !dbg !13
31
+ %.not = icmp ne i64 %.decomposed, 0, !dbg !18
32
+ %.not1 = icmp ne i64 %.decomposed11, 0, !dbg !18
33
+ %28 = icmp slt i32 %11, 0, !dbg !19
34
+ %29 = icmp slt i64 %4, 0, !dbg !20
35
+ %30 = xor i1 %28, %29, !dbg !21
36
+ %narrow = select i1 %30, i1 %.not, i1 false, !dbg !22
37
+ %31 = sext i1 %narrow to i64, !dbg !22
38
+ %32 = add nsw i64 %24, %31, !dbg !22
39
+ %narrow2 = select i1 %30, i1 %.not1, i1 false, !dbg !22
40
+ %33 = sext i1 %narrow2 to i64, !dbg !22
41
+ %34 = add nsw i64 %26, %33, !dbg !22
42
+ %35 = shl nsw i64 %32, 7, !dbg !23
43
+ %36 = zext nneg i32 %21 to i64, !dbg !24
44
+ %37 = getelementptr bfloat, ptr addrspace(1) %0, i64 %35, !dbg !25
45
+ %38 = getelementptr bfloat, ptr addrspace(1) %37, i64 %36, !dbg !25
46
+ %.idx = shl nsw i64 %.decomposed, 13, !dbg !25
47
+ %39 = getelementptr i8, ptr addrspace(1) %38, i64 %.idx, !dbg !25
48
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !26
49
+ %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %39, i64 %40, i1 %18) #4, !dbg !26
50
+ %42 = extractvalue { i32, i32 } %41, 0, !dbg !26
51
+ %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !26
52
+ %44 = extractvalue { i32, i32 } %41, 1, !dbg !26
53
+ %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !26
54
+ %46 = icmp slt i64 %4, 2, !dbg !27
55
+ %47 = icmp sgt i64 %4, 1, !dbg !28
56
+ %48 = select i1 %47, i64 %4, i64 0, !dbg !29
57
+ %49 = zext i1 %46 to i64, !dbg !30
58
+ %50 = add i64 %48, %49, !dbg !31
59
+ %51 = mul i64 %35, %50, !dbg !32
60
+ %.idx3 = shl nsw i64 %.decomposed, 8, !dbg !33
61
+ %52 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx3, !dbg !33
62
+ %53 = getelementptr bfloat, ptr addrspace(1) %52, i64 %36, !dbg !33
63
+ %54 = getelementptr bfloat, ptr addrspace(1) %53, i64 %51, !dbg !33
64
+ %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !34
65
+ %56 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %54, i64 %55, i1 %18) #4, !dbg !34
66
+ %57 = extractvalue { i32, i32 } %56, 0, !dbg !34
67
+ %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !34
68
+ %59 = extractvalue { i32, i32 } %56, 1, !dbg !34
69
+ %60 = bitcast i32 %59 to <2 x bfloat>, !dbg !34
70
+ %61 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !35
71
+ %62 = fpext <2 x bfloat> %58 to <2 x float>, !dbg !36
72
+ %63 = fmul <2 x float> %61, %62, !dbg !37
73
+ %64 = fadd <2 x float> %63, zeroinitializer, !dbg !38
74
+ %65 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !35
75
+ %66 = fpext <2 x bfloat> %60 to <2 x float>, !dbg !36
76
+ %67 = fmul <2 x float> %65, %66, !dbg !37
77
+ %68 = fadd <2 x float> %67, zeroinitializer, !dbg !38
78
+ %shift = shufflevector <2 x float> %64, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !39
79
+ %foldExtExtBinop = fadd <2 x float> %64, %shift, !dbg !39
80
+ %foldExtExtBinop5 = fadd <2 x float> %68, %foldExtExtBinop, !dbg !39
81
+ %shift7 = shufflevector <2 x float> %68, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !39
82
+ %foldExtExtBinop8 = fadd <2 x float> %shift7, %foldExtExtBinop5, !dbg !39
83
+ %69 = extractelement <2 x float> %foldExtExtBinop8, i64 0, !dbg !39
84
+ %70 = select i1 %18, float %69, float 0.000000e+00, !dbg !39
85
+ %71 = bitcast float %70 to i32, !dbg !43
86
+ %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 16, i32 31), !dbg !43
87
+ %73 = bitcast i32 %72 to float, !dbg !43
88
+ %74 = fadd float %70, %73, !dbg !39
89
+ %75 = bitcast float %74 to i32, !dbg !43
90
+ %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 8, i32 31), !dbg !43
91
+ %77 = bitcast i32 %76 to float, !dbg !43
92
+ %78 = fadd float %74, %77, !dbg !39
93
+ %79 = bitcast float %78 to i32, !dbg !43
94
+ %80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 4, i32 31), !dbg !43
95
+ %81 = bitcast i32 %80 to float, !dbg !43
96
+ %82 = fadd float %78, %81, !dbg !39
97
+ %83 = bitcast float %82 to i32, !dbg !43
98
+ %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 2, i32 31), !dbg !43
99
+ %85 = bitcast i32 %84 to float, !dbg !43
100
+ %86 = fadd float %82, %85, !dbg !39
101
+ %87 = bitcast float %86 to i32, !dbg !43
102
+ %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !43
103
+ %89 = bitcast i32 %88 to float, !dbg !43
104
+ %90 = fadd float %86, %89, !dbg !39
105
+ %91 = lshr exact i32 %13, 3, !dbg !44
106
+ %92 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %91, !dbg !44
107
+ store float %90, ptr addrspace(3) %92, align 4, !dbg !44
108
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44
109
+ %93 = shl nuw nsw i32 %15, 2, !dbg !44
110
+ %94 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %93, !dbg !44
111
+ %95 = load float, ptr addrspace(3) %94, align 4, !dbg !44
112
+ %96 = mul i64 %34, %50, !dbg !45
113
+ %97 = getelementptr float, ptr addrspace(1) %2, i64 %.decomposed11, !dbg !46
114
+ %98 = getelementptr float, ptr addrspace(1) %97, i64 %96, !dbg !46
115
+ %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !47
116
+ %100 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %98, i64 %99, i1 %19) #4, !dbg !47
117
+ %101 = bitcast i32 %100 to float, !dbg !47
118
+ %102 = fmul float %101, 0x3FE62E4300000000, !dbg !48
119
+ %103 = fmul float %102, 0x3FF7154760000000, !dbg !49
120
+ %104 = fsub float %95, %103, !dbg !44
121
+ %105 = getelementptr float, ptr addrspace(1) %3, i64 %23, !dbg !50
122
+ %106 = and i32 %12, 124, !dbg !51
123
+ %107 = icmp eq i32 %106, 0, !dbg !51
124
+ %108 = bitcast float %104 to i32, !dbg !51
125
+ %109 = and i1 %107, %19, !dbg !51
126
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %108, ptr addrspace(1) %105, i1 %109) #4, !dbg !51
127
+ ret void, !dbg !52
128
+ }
129
+
130
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
131
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
132
+
133
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
134
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
135
+
136
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
137
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
138
+
139
+ ; Function Attrs: convergent nocallback nounwind
140
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
141
+
142
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
143
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
144
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
145
+ attributes #3 = { convergent nocallback nounwind }
146
+ attributes #4 = { nounwind }
147
+
148
+ !llvm.dbg.cu = !{!0}
149
+ !llvm.module.flags = !{!2, !3}
150
+
151
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
152
+ !1 = !DIFile(filename: "c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q")
153
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
154
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
155
+ !4 = distinct !DISubprogram(name: "triton_red_fused_mul_0", linkageName: "triton_red_fused_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
156
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
157
+ !6 = !{}
158
+ !7 = !DILocation(line: 22, column: 28, scope: !4)
159
+ !8 = !DILocation(line: 22, column: 33, scope: !4)
160
+ !9 = !DILocation(line: 23, column: 44, scope: !4)
161
+ !10 = !DILocation(line: 23, column: 23, scope: !4)
162
+ !11 = !DILocation(line: 24, column: 21, scope: !4)
163
+ !12 = !DILocation(line: 25, column: 37, scope: !4)
164
+ !13 = !DILocation(line: 27, column: 19, scope: !4)
165
+ !14 = !DILocation(line: 72, column: 16, scope: !15, inlinedAt: !17)
166
+ !15 = distinct !DILexicalBlockFile(scope: !4, file: !16, discriminator: 0)
167
+ !16 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
168
+ !17 = !DILocation(line: 28, column: 51, scope: !4)
169
+ !18 = !DILocation(line: 74, column: 34, scope: !15, inlinedAt: !17)
170
+ !19 = !DILocation(line: 75, column: 25, scope: !15, inlinedAt: !17)
171
+ !20 = !DILocation(line: 75, column: 36, scope: !15, inlinedAt: !17)
172
+ !21 = !DILocation(line: 75, column: 32, scope: !15, inlinedAt: !17)
173
+ !22 = !DILocation(line: 75, column: 47, scope: !15, inlinedAt: !17)
174
+ !23 = !DILocation(line: 37, column: 45, scope: !4)
175
+ !24 = !DILocation(line: 37, column: 41, scope: !4)
176
+ !25 = !DILocation(line: 37, column: 34, scope: !4)
177
+ !26 = !DILocation(line: 37, column: 60, scope: !4)
178
+ !27 = !DILocation(line: 38, column: 73, scope: !4)
179
+ !28 = !DILocation(line: 38, column: 99, scope: !4)
180
+ !29 = !DILocation(line: 38, column: 90, scope: !4)
181
+ !30 = !DILocation(line: 38, scope: !4)
182
+ !31 = !DILocation(line: 38, column: 81, scope: !4)
183
+ !32 = !DILocation(line: 38, column: 58, scope: !4)
184
+ !33 = !DILocation(line: 38, column: 34, scope: !4)
185
+ !34 = !DILocation(line: 38, column: 106, scope: !4)
186
+ !35 = !DILocation(line: 37, column: 122, scope: !4)
187
+ !36 = !DILocation(line: 38, column: 168, scope: !4)
188
+ !37 = !DILocation(line: 39, column: 22, scope: !4)
189
+ !38 = !DILocation(line: 41, column: 23, scope: !4)
190
+ !39 = !DILocation(line: 261, column: 15, scope: !40, inlinedAt: !42)
191
+ !40 = distinct !DILexicalBlockFile(scope: !4, file: !41, discriminator: 0)
192
+ !41 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
193
+ !42 = !DILocation(line: 43, column: 25, scope: !4)
194
+ !43 = !DILocation(line: 291, column: 36, scope: !40, inlinedAt: !42)
195
+ !44 = !DILocation(line: 50, column: 19, scope: !4)
196
+ !45 = !DILocation(line: 44, column: 39, scope: !4)
197
+ !46 = !DILocation(line: 44, column: 30, scope: !4)
198
+ !47 = !DILocation(line: 44, column: 87, scope: !4)
199
+ !48 = !DILocation(line: 47, column: 18, scope: !4)
200
+ !49 = !DILocation(line: 49, column: 19, scope: !4)
201
+ !50 = !DILocation(line: 51, column: 25, scope: !4)
202
+ !51 = !DILocation(line: 51, column: 37, scope: !4)
203
+ !52 = !DILocation(line: 51, column: 4, scope: !4)
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ptx ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_red_fused_mul_0 // -- Begin function triton_red_fused_mul_0
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ // @triton_red_fused_mul_0
12
+ .visible .entry triton_red_fused_mul_0(
13
+ .param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_1,
15
+ .param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_2,
16
+ .param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_3,
17
+ .param .u64 triton_red_fused_mul_0_param_4,
18
+ .param .u32 triton_red_fused_mul_0_param_5,
19
+ .param .u32 triton_red_fused_mul_0_param_6,
20
+ .param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_7,
21
+ .param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_8
22
+ )
23
+ .reqntid 128
24
+ {
25
+ .reg .pred %p<17>;
26
+ .reg .b16 %rs<9>;
27
+ .reg .b32 %r<65>;
28
+ .reg .b64 %rd<59>;
29
+ .loc 1 18 0 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:18:0
30
+ $L__func_begin0:
31
+ .loc 1 18 0 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:18:0
32
+
33
+ // %bb.0:
34
+ ld.param.b64 %rd15, [triton_red_fused_mul_0_param_4];
35
+ $L__tmp0:
36
+ .loc 1 22 28 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:22:28
37
+ mov.u32 %r7, %ctaid.x;
38
+ .loc 1 22 33 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:22:33
39
+ shl.b32 %r1, %r7, 2;
40
+ .loc 1 23 44 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:23:44
41
+ mov.u32 %r2, %tid.x;
42
+ bfe.u32 %r8, %r2, 5, 2;
43
+ and.b32 %r4, %r2, 3;
44
+ .loc 1 23 23 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:23:23
45
+ or.b32 %r9, %r8, %r1;
46
+ or.b32 %r10, %r1, %r4;
47
+ .loc 1 25 37 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:25:37
48
+ shl.b32 %r11, %r2, 2;
49
+ .loc 1 27 19 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:27:19
50
+ cvt.s64.s32 %rd1, %r9;
51
+ cvt.s64.s32 %rd2, %r10;
52
+ $L__tmp1:
53
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:28:51 ]
54
+ or.b64 %rd17, %rd1, %rd15;
55
+ and.b64 %rd18, %rd17, -4294967296;
56
+ setp.ne.b64 %p1, %rd18, 0;
57
+ cvt.u32.u64 %r64, %rd1;
58
+ @%p1 bra $L__BB0_2;
59
+ bra.uni $L__BB0_1;
60
+ $L__BB0_2:
61
+ div.s64 %rd57, %rd1, %rd15;
62
+ bra.uni $L__BB0_3;
63
+ $L__BB0_1:
64
+ cvt.u32.u64 %r12, %rd15;
65
+ div.u32 %r14, %r64, %r12;
66
+ cvt.u64.u32 %rd57, %r14;
67
+ $L__tmp2:
68
+ $L__BB0_3:
69
+ .loc 2 0 16 // triton_helpers.py:0:16
70
+ ld.param.b32 %r6, [triton_red_fused_mul_0_param_5];
71
+ ld.param.b64 %rd14, [triton_red_fused_mul_0_param_3];
72
+ ld.param.b64 %rd13, [triton_red_fused_mul_0_param_2];
73
+ ld.param.b64 %rd12, [triton_red_fused_mul_0_param_1];
74
+ ld.param.b64 %rd11, [triton_red_fused_mul_0_param_0];
75
+ and.b32 %r3, %r2, 96;
76
+ and.b32 %r5, %r11, 124;
77
+ .loc 1 27 19 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:27:19
78
+ mul.lo.s64 %rd19, %rd57, %rd15;
79
+ sub.s64 %rd7, %rd1, %rd19;
80
+ $L__tmp3:
81
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:28:51 ]
82
+ or.b64 %rd20, %rd2, %rd15;
83
+ and.b64 %rd21, %rd20, -4294967296;
84
+ setp.ne.b64 %p2, %rd21, 0;
85
+ @%p2 bra $L__BB0_5;
86
+ bra.uni $L__BB0_4;
87
+ $L__BB0_5:
88
+ div.s64 %rd58, %rd2, %rd15;
89
+ bra.uni $L__BB0_6;
90
+ $L__BB0_4:
91
+ cvt.u32.u64 %r15, %rd15;
92
+ cvt.u32.u64 %r16, %rd2;
93
+ div.u32 %r17, %r16, %r15;
94
+ cvt.u64.u32 %rd58, %r17;
95
+ $L__tmp4:
96
+ $L__BB0_6:
97
+ .loc 2 0 16 // triton_helpers.py:0:16
98
+ cvt.u32.u64 %r28, %rd2;
99
+ .loc 1 24 21 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:24:21
100
+ setp.lt.s32 %p5, %r28, %r6;
101
+ setp.lt.s32 %p3, %r64, %r6;
102
+ .loc 1 27 19 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:27:19
103
+ mul.lo.s64 %rd33, %rd58, %rd15;
104
+ sub.s64 %rd34, %rd2, %rd33;
105
+ $L__tmp5:
106
+ .loc 2 74 34 // triton_helpers.py:74:34 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:28:51 ]
107
+ setp.ne.b64 %p7, %rd7, 0;
108
+ setp.ne.b64 %p8, %rd34, 0;
109
+ .loc 2 75 25 // triton_helpers.py:75:25 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:28:51 ]
110
+ setp.lt.s32 %p9, %r1, 0;
111
+ .loc 2 75 36 // triton_helpers.py:75:36 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:28:51 ]
112
+ setp.lt.s64 %p10, %rd15, 0;
113
+ .loc 2 75 32 // triton_helpers.py:75:32 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:28:51 ]
114
+ xor.pred %p11, %p9, %p10;
115
+ .loc 2 75 47 // triton_helpers.py:75:47 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:28:51 ]
116
+ and.pred %p12, %p11, %p7;
117
+ selp.b64 %rd35, -1, 0, %p12;
118
+ add.s64 %rd36, %rd57, %rd35;
119
+ and.pred %p13, %p11, %p8;
120
+ selp.b64 %rd37, -1, 0, %p13;
121
+ add.s64 %rd38, %rd58, %rd37;
122
+ $L__tmp6:
123
+ .loc 1 37 34 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:37:34
124
+ shl.b64 %rd39, %rd36, 8;
125
+ add.s64 %rd40, %rd11, %rd39;
126
+ mul.wide.u32 %rd41, %r5, 2;
127
+ add.s64 %rd42, %rd40, %rd41;
128
+ shl.b64 %rd43, %rd7, 13;
129
+ add.s64 %rd23, %rd42, %rd43;
130
+ .loc 1 37 60 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:37:60
131
+ // begin inline asm
132
+ mov.u64 %rd24, 0x0;
133
+ createpolicy.fractional.L2::evict_first.b64 %rd24, 1.0;
134
+ // end inline asm
135
+ mov.b32 %r20, 0;
136
+ // begin inline asm
137
+ mov.u32 %r18, %r20;
138
+ mov.u32 %r19, %r20;
139
+ @%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r18, %r19 }, [ %rd23 + 0 ], %rd24;
140
+ // end inline asm
141
+ .loc 1 38 73 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:73
142
+ setp.lt.s64 %p14, %rd15, 2;
143
+ .loc 1 38 99 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:99
144
+ setp.gt.s64 %p15, %rd15, 1;
145
+ .loc 1 38 90 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:90
146
+ selp.b64 %rd44, %rd15, 0, %p15;
147
+ .loc 1 38 0 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38
148
+ selp.b64 %rd45, 1, 0, %p14;
149
+ .loc 1 38 81 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:81
150
+ add.s64 %rd46, %rd44, %rd45;
151
+ .loc 1 38 58 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:58
152
+ mul.lo.s64 %rd47, %rd36, %rd46;
153
+ .loc 1 38 34 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:34
154
+ shl.b64 %rd48, %rd7, 8;
155
+ add.s64 %rd49, %rd12, %rd48;
156
+ add.s64 %rd50, %rd49, %rd41;
157
+ shl.b64 %rd51, %rd47, 8;
158
+ add.s64 %rd26, %rd50, %rd51;
159
+ .loc 1 38 106 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:106
160
+ // begin inline asm
161
+ mov.u64 %rd27, 0x0;
162
+ createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0;
163
+ // end inline asm
164
+ // begin inline asm
165
+ mov.u32 %r22, %r20;
166
+ mov.u32 %r23, %r20;
167
+ @%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r22, %r23 }, [ %rd26 + 0 ], %rd27;
168
+ // end inline asm
169
+ .loc 1 37 122 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:37:122
170
+ mov.b32 {%rs1, %rs2}, %r18;
171
+ cvt.f32.bf16 %r30, %rs1;
172
+ cvt.f32.bf16 %r31, %rs2;
173
+ .loc 1 38 168 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:168
174
+ mov.b32 {%rs3, %rs4}, %r22;
175
+ cvt.f32.bf16 %r32, %rs3;
176
+ cvt.f32.bf16 %r33, %rs4;
177
+ .loc 1 41 23 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:41:23
178
+ fma.rn.f32 %r34, %r31, %r33, 0f00000000;
179
+ fma.rn.f32 %r35, %r30, %r32, 0f00000000;
180
+ .loc 1 37 122 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:37:122
181
+ mov.b32 {%rs5, %rs6}, %r19;
182
+ cvt.f32.bf16 %r36, %rs5;
183
+ cvt.f32.bf16 %r37, %rs6;
184
+ .loc 1 38 168 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:38:168
185
+ mov.b32 {%rs7, %rs8}, %r23;
186
+ cvt.f32.bf16 %r38, %rs7;
187
+ cvt.f32.bf16 %r39, %rs8;
188
+ .loc 1 41 23 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:41:23
189
+ fma.rn.f32 %r40, %r37, %r39, 0f00000000;
190
+ fma.rn.f32 %r41, %r36, %r38, 0f00000000;
191
+ $L__tmp7:
192
+ .loc 3 261 15 // standard.py:261:15 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
193
+ add.f32 %r42, %r35, %r34;
194
+ add.f32 %r43, %r41, %r42;
195
+ add.f32 %r44, %r40, %r43;
196
+ selp.f32 %r45, %r44, 0f00000000, %p3;
197
+ .loc 3 291 36 // standard.py:291:36 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
198
+ shfl.sync.bfly.b32 %r46, %r45, 16, 31, -1;
199
+ .loc 3 261 15 // standard.py:261:15 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
200
+ add.f32 %r47, %r45, %r46;
201
+ .loc 3 291 36 // standard.py:291:36 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
202
+ shfl.sync.bfly.b32 %r48, %r47, 8, 31, -1;
203
+ .loc 3 261 15 // standard.py:261:15 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
204
+ add.f32 %r49, %r47, %r48;
205
+ .loc 3 291 36 // standard.py:291:36 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
206
+ shfl.sync.bfly.b32 %r50, %r49, 4, 31, -1;
207
+ .loc 3 261 15 // standard.py:261:15 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
208
+ add.f32 %r51, %r49, %r50;
209
+ .loc 3 291 36 // standard.py:291:36 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
210
+ shfl.sync.bfly.b32 %r52, %r51, 2, 31, -1;
211
+ .loc 3 261 15 // standard.py:261:15 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
212
+ add.f32 %r53, %r51, %r52;
213
+ .loc 3 291 36 // standard.py:291:36 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
214
+ shfl.sync.bfly.b32 %r54, %r53, 1, 31, -1;
215
+ .loc 3 261 15 // standard.py:261:15 @[ c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:43:25 ]
216
+ add.f32 %r55, %r53, %r54;
217
+ $L__tmp8:
218
+ .loc 1 50 19 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:50:19
219
+ shr.u32 %r56, %r3, 3;
220
+ mov.b32 %r57, global_smem;
221
+ add.s32 %r58, %r57, %r56;
222
+ st.shared.b32 [%r58], %r55;
223
+ bar.sync 0;
224
+ shl.b32 %r59, %r4, 2;
225
+ add.s32 %r60, %r57, %r59;
226
+ ld.shared.b32 %r61, [%r60];
227
+ .loc 1 44 39 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:44:39
228
+ mul.lo.s64 %rd52, %rd38, %rd46;
229
+ .loc 1 44 30 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:44:30
230
+ shl.b64 %rd53, %rd34, 2;
231
+ add.s64 %rd54, %rd13, %rd53;
232
+ shl.b64 %rd55, %rd52, 2;
233
+ add.s64 %rd29, %rd54, %rd55;
234
+ .loc 1 44 87 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:44:87
235
+ // begin inline asm
236
+ mov.u64 %rd30, 0x0;
237
+ createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0;
238
+ // end inline asm
239
+ // begin inline asm
240
+ mov.u32 %r26, 0x0;
241
+ @%p5 ld.global.L1::evict_last.L2::cache_hint.b32 { %r26 }, [ %rd29 + 0 ], %rd30;
242
+ // end inline asm
243
+ .loc 1 47 18 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:47:18
244
+ mul.f32 %r62, %r26, 0fBF317218;
245
+ .loc 1 50 19 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:50:19
246
+ fma.rn.f32 %r27, %r62, 0f3FB8AA3B, %r61;
247
+ .loc 1 51 25 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:51:25
248
+ shl.b64 %rd56, %rd2, 2;
249
+ add.s64 %rd31, %rd14, %rd56;
250
+ .loc 1 51 37 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:51:37
251
+ and.b32 %r63, %r2, 124;
252
+ setp.eq.b32 %p16, %r63, 0;
253
+ and.pred %p6, %p16, %p5;
254
+ // begin inline asm
255
+ @%p6 st.global.b32 [ %rd31 + 0 ], { %r27 };
256
+ // end inline asm
257
+ .loc 1 51 4 // c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py:51:4
258
+ ret;
259
+ $L__tmp9:
260
+ $L__func_end0:
261
+ // -- End function
262
+ }
263
+ .file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py"
264
+ .file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
265
+ .file 3 "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py"
266
+ .section .debug_abbrev
267
+ {
268
+ .b8 1 // Abbreviation Code
269
+ .b8 17 // DW_TAG_compile_unit
270
+ .b8 1 // DW_CHILDREN_yes
271
+ .b8 37 // DW_AT_producer
272
+ .b8 8 // DW_FORM_string
273
+ .b8 19 // DW_AT_language
274
+ .b8 5 // DW_FORM_data2
275
+ .b8 3 // DW_AT_name
276
+ .b8 8 // DW_FORM_string
277
+ .b8 16 // DW_AT_stmt_list
278
+ .b8 6 // DW_FORM_data4
279
+ .b8 27 // DW_AT_comp_dir
280
+ .b8 8 // DW_FORM_string
281
+ .b8 0 // EOM(1)
282
+ .b8 0 // EOM(2)
283
+ .b8 2 // Abbreviation Code
284
+ .b8 46 // DW_TAG_subprogram
285
+ .b8 0 // DW_CHILDREN_no
286
+ .b8 3 // DW_AT_name
287
+ .b8 8 // DW_FORM_string
288
+ .b8 32 // DW_AT_inline
289
+ .b8 11 // DW_FORM_data1
290
+ .b8 0 // EOM(1)
291
+ .b8 0 // EOM(2)
292
+ .b8 3 // Abbreviation Code
293
+ .b8 46 // DW_TAG_subprogram
294
+ .b8 1 // DW_CHILDREN_yes
295
+ .b8 17 // DW_AT_low_pc
296
+ .b8 1 // DW_FORM_addr
297
+ .b8 18 // DW_AT_high_pc
298
+ .b8 1 // DW_FORM_addr
299
+ .b8 49 // DW_AT_abstract_origin
300
+ .b8 19 // DW_FORM_ref4
301
+ .b8 0 // EOM(1)
302
+ .b8 0 // EOM(2)
303
+ .b8 4 // Abbreviation Code
304
+ .b8 29 // DW_TAG_inlined_subroutine
305
+ .b8 0 // DW_CHILDREN_no
306
+ .b8 49 // DW_AT_abstract_origin
307
+ .b8 19 // DW_FORM_ref4
308
+ .b8 17 // DW_AT_low_pc
309
+ .b8 1 // DW_FORM_addr
310
+ .b8 18 // DW_AT_high_pc
311
+ .b8 1 // DW_FORM_addr
312
+ .b8 88 // DW_AT_call_file
313
+ .b8 11 // DW_FORM_data1
314
+ .b8 89 // DW_AT_call_line
315
+ .b8 11 // DW_FORM_data1
316
+ .b8 87 // DW_AT_call_column
317
+ .b8 11 // DW_FORM_data1
318
+ .b8 0 // EOM(1)
319
+ .b8 0 // EOM(2)
320
+ .b8 0 // EOM(3)
321
+ }
322
+ .section .debug_info
323
+ {
324
+ .b32 235 // Length of Unit
325
+ .b8 2 // DWARF version number
326
+ .b8 0
327
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
328
+ .b8 8 // Address Size (in bytes)
329
+ .b8 1 // Abbrev [1] 0xb:0xe4 DW_TAG_compile_unit
330
+ .b8 116 // DW_AT_producer
331
+ .b8 114
332
+ .b8 105
333
+ .b8 116
334
+ .b8 111
335
+ .b8 110
336
+ .b8 0
337
+ .b8 2 // DW_AT_language
338
+ .b8 0
339
+ .b8 99 // DW_AT_name
340
+ .b8 51
341
+ .b8 113
342
+ .b8 98
343
+ .b8 118
344
+ .b8 99
345
+ .b8 115
346
+ .b8 120
347
+ .b8 50
348
+ .b8 119
349
+ .b8 55
350
+ .b8 113
351
+ .b8 115
352
+ .b8 115
353
+ .b8 50
354
+ .b8 118
355
+ .b8 51
356
+ .b8 101
357
+ .b8 111
358
+ .b8 99
359
+ .b8 117
360
+ .b8 97
361
+ .b8 100
362
+ .b8 103
363
+ .b8 122
364
+ .b8 54
365
+ .b8 116
366
+ .b8 51
367
+ .b8 53
368
+ .b8 106
369
+ .b8 111
370
+ .b8 111
371
+ .b8 51
372
+ .b8 51
373
+ .b8 98
374
+ .b8 102
375
+ .b8 108
376
+ .b8 122
377
+ .b8 113
378
+ .b8 107
379
+ .b8 120
380
+ .b8 122
381
+ .b8 122
382
+ .b8 106
383
+ .b8 55
384
+ .b8 52
385
+ .b8 55
386
+ .b8 122
387
+ .b8 99
388
+ .b8 106
389
+ .b8 112
390
+ .b8 107
391
+ .b8 46
392
+ .b8 112
393
+ .b8 121
394
+ .b8 0
395
+ .b32 .debug_line // DW_AT_stmt_list
396
+ .b8 47 // DW_AT_comp_dir
397
+ .b8 119
398
+ .b8 111
399
+ .b8 114
400
+ .b8 107
401
+ .b8 115
402
+ .b8 112
403
+ .b8 97
404
+ .b8 99
405
+ .b8 101
406
+ .b8 47
407
+ .b8 104
408
+ .b8 97
409
+ .b8 110
410
+ .b8 114
411
+ .b8 117
412
+ .b8 105
413
+ .b8 47
414
+ .b8 106
415
+ .b8 117
416
+ .b8 110
417
+ .b8 113
418
+ .b8 117
419
+ .b8 97
420
+ .b8 110
421
+ .b8 47
422
+ .b8 83
423
+ .b8 112
424
+ .b8 101
425
+ .b8 99
426
+ .b8 70
427
+ .b8 111
428
+ .b8 114
429
+ .b8 103
430
+ .b8 101
431
+ .b8 47
432
+ .b8 99
433
+ .b8 97
434
+ .b8 99
435
+ .b8 104
436
+ .b8 101
437
+ .b8 47
438
+ .b8 99
439
+ .b8 111
440
+ .b8 109
441
+ .b8 112
442
+ .b8 105
443
+ .b8 108
444
+ .b8 101
445
+ .b8 100
446
+ .b8 95
447
+ .b8 107
448
+ .b8 101
449
+ .b8 114
450
+ .b8 110
451
+ .b8 101
452
+ .b8 108
453
+ .b8 115
454
+ .b8 47
455
+ .b8 51
456
+ .b8 113
457
+ .b8 0
458
+ .b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
459
+ .b8 116 // DW_AT_name
460
+ .b8 114
461
+ .b8 105
462
+ .b8 116
463
+ .b8 111
464
+ .b8 110
465
+ .b8 95
466
+ .b8 114
467
+ .b8 101
468
+ .b8 100
469
+ .b8 95
470
+ .b8 102
471
+ .b8 117
472
+ .b8 115
473
+ .b8 101
474
+ .b8 100
475
+ .b8 95
476
+ .b8 109
477
+ .b8 117
478
+ .b8 108
479
+ .b8 95
480
+ .b8 48
481
+ .b8 0
482
+ .b8 1 // DW_AT_inline
483
+ .b8 3 // Abbrev [3] 0xa8:0x46 DW_TAG_subprogram
484
+ .b64 $L__func_begin0 // DW_AT_low_pc
485
+ .b64 $L__func_end0 // DW_AT_high_pc
486
+ .b32 143 // DW_AT_abstract_origin
487
+ .b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
488
+ .b32 143 // DW_AT_abstract_origin
489
+ .b64 $L__tmp1 // DW_AT_low_pc
490
+ .b64 $L__tmp6 // DW_AT_high_pc
491
+ .b8 1 // DW_AT_call_file
492
+ .b8 28 // DW_AT_call_line
493
+ .b8 51 // DW_AT_call_column
494
+ .b8 4 // Abbrev [4] 0xd5:0x18 DW_TAG_inlined_subroutine
495
+ .b32 143 // DW_AT_abstract_origin
496
+ .b64 $L__tmp7 // DW_AT_low_pc
497
+ .b64 $L__tmp8 // DW_AT_high_pc
498
+ .b8 1 // DW_AT_call_file
499
+ .b8 43 // DW_AT_call_line
500
+ .b8 25 // DW_AT_call_column
501
+ .b8 0 // End Of Children Mark
502
+ .b8 0 // End Of Children Mark
503
+ }
504
+ .section .debug_macinfo { }
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.source ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":18:0)
2
+ #loc62 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
3
+ #loc74 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
4
+ #loc76 = loc(unknown)
5
+ #loc79 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
6
+ #loc83 = loc("in_ptr0"(#loc))
7
+ #loc84 = loc("in_ptr1"(#loc))
8
+ #loc85 = loc("in_ptr2"(#loc))
9
+ #loc86 = loc("out_ptr1"(#loc))
10
+ #loc87 = loc("ks0"(#loc))
11
+ #loc88 = loc("xnumel"(#loc))
12
+ #loc89 = loc("r0_numel"(#loc))
13
+ #loc147 = loc("a"(#loc62))
14
+ #loc148 = loc("b"(#loc62))
15
+ #loc154 = loc("input"(#loc74))
16
+ #loc155 = loc("a"(#loc79))
17
+ #loc156 = loc("b"(#loc79))
18
+ module {
19
+ tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
20
+ %r0_numel_0 = arith.constant 128 : i32 loc(#loc90)
21
+ %xoffset = tt.get_program_id x : i32 loc(#loc91)
22
+ %xoffset_1 = arith.constant 4 : i32 loc(#loc92)
23
+ %xoffset_2 = arith.constant 4 : i32 loc(#loc92)
24
+ %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc92)
25
+ %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc93)
26
+ %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc94)
27
+ %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<4x1xi32> loc(#loc95)
28
+ %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<4x1xi32> loc(#loc95)
29
+ %xmask = tt.splat %xnumel : i32 -> tensor<4x1xi32> loc(#loc96)
30
+ %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<4x1xi32> loc(#loc96)
31
+ %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc97)
32
+ %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc98)
33
+ %x0 = arith.extsi %xindex_6 : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc99)
34
+ %x0_9 = tt.splat %ks0 : i64 -> tensor<4x1xi64> loc(#loc99)
35
+ %x0_10 = arith.remsi %x0, %x0_9 : tensor<4x1xi64> loc(#loc99)
36
+ %x1 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S4_1S_i64__(%xindex_6, %ks0) : (tensor<4x1xi32>, i64) -> tensor<4x1xi64> loc(#loc100)
37
+ %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc101)
38
+ %_tmp4_11 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc101)
39
+ %c0_i32 = arith.constant 0 : i32 loc(#loc13)
40
+ %c128_i32 = arith.constant 128 : i32 loc(#loc13)
41
+ %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13)
42
+ %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc13)
43
+ %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc13)
44
+ %3 = ub.poison : i32 loc(#loc13)
45
+ %_tmp4_12 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_35 = %_tmp4_11) -> (tensor<4x128xf32>) : i32 {
46
+ %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc103)
47
+ %r0_index_36 = arith.addi %r0_index, %r0_base_8 : tensor<1x128xi32> loc(#loc103)
48
+ %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc104)
49
+ %r0_mask_37 = arith.cmpi slt, %r0_index_36, %r0_mask : tensor<1x128xi32> loc(#loc104)
50
+ %tmp0 = arith.constant 128 : i32 loc(#loc105)
51
+ %tmp0_38 = arith.constant 128 : i64 loc(#loc105)
52
+ %tmp0_39 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc105)
53
+ %tmp0_40 = arith.muli %tmp0_39, %x1 : tensor<4x1xi64> loc(#loc105)
54
+ %tmp0_41 = arith.extsi %r0_index_36 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc106)
55
+ %tmp0_42 = tt.broadcast %tmp0_41 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc106)
56
+ %tmp0_43 = tt.broadcast %tmp0_40 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc106)
57
+ %tmp0_44 = arith.addi %tmp0_42, %tmp0_43 : tensor<4x128xi64> loc(#loc106)
58
+ %tmp0_45 = arith.constant 4096 : i32 loc(#loc107)
59
+ %tmp0_46 = arith.constant 4096 : i64 loc(#loc107)
60
+ %tmp0_47 = arith.constant dense<4096> : tensor<4x1xi64> loc(#loc107)
61
+ %tmp0_48 = arith.muli %tmp0_47, %x0_10 : tensor<4x1xi64> loc(#loc107)
62
+ %tmp0_49 = tt.broadcast %tmp0_48 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc108)
63
+ %tmp0_50 = arith.addi %tmp0_44, %tmp0_49 : tensor<4x128xi64> loc(#loc108)
64
+ %tmp0_51 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc109)
65
+ %tmp0_52 = tt.addptr %tmp0_51, %tmp0_50 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi64> loc(#loc109)
66
+ %tmp0_53 = tt.broadcast %r0_mask_37 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc110)
67
+ %tmp0_54 = tt.broadcast %xmask_7 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc110)
68
+ %tmp0_55 = arith.andi %tmp0_53, %tmp0_54 : tensor<4x128xi1> loc(#loc110)
69
+ %tmp0_56 = arith.constant 0.000000e+00 : f32 loc(#loc111)
70
+ %tmp0_57 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc111)
71
+ %tmp0_58 = arith.truncf %tmp0_57 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc111)
72
+ %tmp0_59 = tt.load %tmp0_52, %tmp0_55, %tmp0_58 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc111)
73
+ %tmp0_60 = arith.extf %tmp0_59 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc112)
74
+ %tmp1 = arith.constant 128 : i32 loc(#loc113)
75
+ %tmp1_61 = arith.constant 128 : i64 loc(#loc113)
76
+ %tmp1_62 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc113)
77
+ %tmp1_63 = arith.muli %tmp1_62, %x0_10 : tensor<4x1xi64> loc(#loc113)
78
+ %tmp1_64 = arith.extsi %r0_index_36 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc114)
79
+ %tmp1_65 = tt.broadcast %tmp1_64 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc114)
80
+ %tmp1_66 = tt.broadcast %tmp1_63 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc114)
81
+ %tmp1_67 = arith.addi %tmp1_65, %tmp1_66 : tensor<4x128xi64> loc(#loc114)
82
+ %tmp1_68 = arith.constant 128 : i32 loc(#loc115)
83
+ %tmp1_69 = arith.constant 128 : i64 loc(#loc115)
84
+ %tmp1_70 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc115)
85
+ %tmp1_71 = arith.muli %tmp1_70, %x1 : tensor<4x1xi64> loc(#loc115)
86
+ %tmp1_72 = arith.constant 1 : i32 loc(#loc116)
87
+ %tmp1_73 = arith.extsi %tmp1_72 : i32 to i64 loc(#loc116)
88
+ %tmp1_74 = arith.cmpi sge, %tmp1_73, %ks0 : i64 loc(#loc116)
89
+ %tmp1_75 = arith.constant 1 : i32 loc(#loc117)
90
+ %tmp1_76 = arith.constant 1 : i32 loc(#loc117)
91
+ %tmp1_77 = arith.extui %tmp1_74 : i1 to i32 loc(#loc117)
92
+ %tmp1_78 = arith.muli %tmp1_76, %tmp1_77 : i32 loc(#loc117)
93
+ %tmp1_79 = arith.constant 1 : i32 loc(#loc118)
94
+ %tmp1_80 = arith.extsi %tmp1_79 : i32 to i64 loc(#loc118)
95
+ %tmp1_81 = arith.cmpi sgt, %ks0, %tmp1_80 : i64 loc(#loc118)
96
+ %tmp1_82 = arith.extui %tmp1_81 : i1 to i64 loc(#loc119)
97
+ %tmp1_83 = arith.muli %ks0, %tmp1_82 : i64 loc(#loc119)
98
+ %tmp1_84 = arith.extsi %tmp1_78 : i32 to i64 loc(#loc120)
99
+ %tmp1_85 = arith.addi %tmp1_84, %tmp1_83 : i64 loc(#loc120)
100
+ %tmp1_86 = tt.splat %tmp1_85 : i64 -> tensor<4x1xi64> loc(#loc121)
101
+ %tmp1_87 = arith.muli %tmp1_71, %tmp1_86 : tensor<4x1xi64> loc(#loc121)
102
+ %tmp1_88 = tt.broadcast %tmp1_87 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc122)
103
+ %tmp1_89 = arith.addi %tmp1_67, %tmp1_88 : tensor<4x128xi64> loc(#loc122)
104
+ %tmp1_90 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc123)
105
+ %tmp1_91 = tt.addptr %tmp1_90, %tmp1_89 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi64> loc(#loc123)
106
+ %tmp1_92 = tt.broadcast %r0_mask_37 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc124)
107
+ %tmp1_93 = tt.broadcast %xmask_7 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc124)
108
+ %tmp1_94 = arith.andi %tmp1_92, %tmp1_93 : tensor<4x128xi1> loc(#loc124)
109
+ %tmp1_95 = arith.constant 0.000000e+00 : f32 loc(#loc125)
110
+ %tmp1_96 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc125)
111
+ %tmp1_97 = arith.truncf %tmp1_96 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc125)
112
+ %tmp1_98 = tt.load %tmp1_91, %tmp1_94, %tmp1_97 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc125)
113
+ %tmp1_99 = arith.extf %tmp1_98 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc126)
114
+ %tmp2 = arith.mulf %tmp0_60, %tmp1_99 : tensor<4x128xf32> loc(#loc127)
115
+ %tmp5 = arith.addf %_tmp4_35, %tmp2 : tensor<4x128xf32> loc(#loc128)
116
+ %_tmp4_100 = tt.broadcast %r0_mask_37 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc129)
117
+ %_tmp4_101 = tt.broadcast %xmask_7 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc129)
118
+ %_tmp4_102 = arith.andi %_tmp4_100, %_tmp4_101 : tensor<4x128xi1> loc(#loc129)
119
+ %_tmp4_103 = arith.select %_tmp4_102, %tmp5, %_tmp4_35 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc130)
120
+ scf.yield %_tmp4_103 : tensor<4x128xf32> loc(#loc42)
121
+ } loc(#loc102)
122
+ %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_12) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc131)
123
+ %tmp4_13 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc132)
124
+ %tmp7 = arith.constant 1 : i32 loc(#loc133)
125
+ %tmp7_14 = arith.extsi %tmp7 : i32 to i64 loc(#loc133)
126
+ %tmp7_15 = arith.cmpi sge, %tmp7_14, %ks0 : i64 loc(#loc133)
127
+ %tmp7_16 = arith.constant 1 : i32 loc(#loc134)
128
+ %tmp7_17 = arith.constant 1 : i32 loc(#loc134)
129
+ %tmp7_18 = arith.extui %tmp7_15 : i1 to i32 loc(#loc134)
130
+ %tmp7_19 = arith.muli %tmp7_17, %tmp7_18 : i32 loc(#loc134)
131
+ %tmp7_20 = arith.constant 1 : i32 loc(#loc135)
132
+ %tmp7_21 = arith.extsi %tmp7_20 : i32 to i64 loc(#loc135)
133
+ %tmp7_22 = arith.cmpi sgt, %ks0, %tmp7_21 : i64 loc(#loc135)
134
+ %tmp7_23 = arith.extui %tmp7_22 : i1 to i64 loc(#loc136)
135
+ %tmp7_24 = arith.muli %ks0, %tmp7_23 : i64 loc(#loc136)
136
+ %tmp7_25 = arith.extsi %tmp7_19 : i32 to i64 loc(#loc137)
137
+ %tmp7_26 = arith.addi %tmp7_25, %tmp7_24 : i64 loc(#loc137)
138
+ %tmp7_27 = tt.splat %tmp7_26 : i64 -> tensor<4x1xi64> loc(#loc138)
139
+ %tmp7_28 = arith.muli %x1, %tmp7_27 : tensor<4x1xi64> loc(#loc138)
140
+ %tmp7_29 = arith.addi %x0_10, %tmp7_28 : tensor<4x1xi64> loc(#loc139)
141
+ %tmp7_30 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc140)
142
+ %tmp7_31 = tt.addptr %tmp7_30, %tmp7_29 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi64> loc(#loc140)
143
+ %tmp7_32 = tt.load %tmp7_31, %xmask_7 evictionPolicy = evict_last : tensor<4x1x!tt.ptr<f32>> loc(#loc141)
144
+ %tmp8 = arith.constant 0.693147182 : f32 loc(#loc142)
145
+ %tmp9 = arith.constant dense<0.693147182> : tensor<4x1xf32> loc(#loc143)
146
+ %tmp9_33 = arith.mulf %tmp7_32, %tmp9 : tensor<4x1xf32> loc(#loc143)
147
+ %tmp10 = arith.constant 1.44269502 : f32 loc(#loc144)
148
+ %tmp11 = arith.constant dense<1.44269502> : tensor<4x1xf32> loc(#loc145)
149
+ %tmp11_34 = arith.mulf %tmp9_33, %tmp11 : tensor<4x1xf32> loc(#loc145)
150
+ %tmp12 = arith.subf %tmp4_13, %tmp11_34 : tensor<4x1xf32> loc(#loc146)
151
+ %4 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc59)
152
+ %5 = tt.addptr %4, %xindex_6 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi32> loc(#loc59)
153
+ tt.store %5, %tmp12, %xmask_7 : tensor<4x1x!tt.ptr<f32>> loc(#loc60)
154
+ tt.return loc(#loc61)
155
+ } loc(#loc)
156
+ tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S4_1S_i64__(%a: tensor<4x1xi32> loc("a"(#loc62)), %b: i64 loc("b"(#loc62))) -> tensor<4x1xi64> attributes {noinline = false} {
157
+ %quot = arith.extsi %a : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc149)
158
+ %quot_0 = tt.splat %b : i64 -> tensor<4x1xi64> loc(#loc149)
159
+ %quot_1 = arith.divsi %quot, %quot_0 : tensor<4x1xi64> loc(#loc149)
160
+ %remainder = arith.extsi %a : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc150)
161
+ %remainder_2 = tt.splat %b : i64 -> tensor<4x1xi64> loc(#loc150)
162
+ %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<4x1xi64> loc(#loc150)
163
+ %fixed = arith.constant 0 : i32 loc(#loc151)
164
+ %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc151)
165
+ %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<4x1xi64> loc(#loc151)
166
+ %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<4x1xi64> loc(#loc151)
167
+ %fixed_7 = arith.constant 1 : i32 loc(#loc152)
168
+ %fixed_8 = arith.constant 1 : i64 loc(#loc152)
169
+ %fixed_9 = arith.constant dense<1> : tensor<4x1xi64> loc(#loc152)
170
+ %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<4x1xi64> loc(#loc152)
171
+ %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc153)
172
+ %c0_i32 = arith.constant 0 : i32 loc(#loc68)
173
+ %cst = arith.constant dense<0> : tensor<4x1xi32> loc(#loc68)
174
+ %0 = arith.cmpi slt, %a, %cst : tensor<4x1xi32> loc(#loc68)
175
+ %c0_i32_12 = arith.constant 0 : i32 loc(#loc69)
176
+ %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc69)
177
+ %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc69)
178
+ %3 = tt.splat %2 : i1 -> tensor<4x1xi1> loc(#loc70)
179
+ %4 = arith.cmpi ne, %0, %3 : tensor<4x1xi1> loc(#loc70)
180
+ %5 = arith.select %4, %fixed_11, %quot_1 : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc71)
181
+ tt.return %5 : tensor<4x1xi64> loc(#loc72)
182
+ ^bb1: // no predecessors
183
+ %6 = ub.poison : tensor<4x1xi64> loc(#loc73)
184
+ tt.return %6 : tensor<4x1xi64> loc(#loc73)
185
+ } loc(#loc62)
186
+ tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc74))) -> tensor<4xf32> attributes {noinline = false} {
187
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
188
+ ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
189
+ %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc75)
190
+ tt.reduce.return %2 : f32 loc(#loc75)
191
+ }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc75)
192
+ tt.return %0 : tensor<4xf32> loc(#loc77)
193
+ ^bb1: // no predecessors
194
+ %1 = ub.poison : tensor<4xf32> loc(#loc78)
195
+ tt.return %1 : tensor<4xf32> loc(#loc78)
196
+ } loc(#loc74)
197
+ tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc79)), %b: f32 loc("b"(#loc79))) -> f32 attributes {noinline = false} {
198
+ %0 = arith.addf %a, %b : f32 loc(#loc80)
199
+ tt.return %0 : f32 loc(#loc81)
200
+ ^bb1: // no predecessors
201
+ %1 = ub.poison : f32 loc(#loc82)
202
+ tt.return %1 : f32 loc(#loc82)
203
+ } loc(#loc79)
204
+ } loc(#loc)
205
+ #loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":19:15)
206
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":22:28)
207
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":22:33)
208
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:36)
209
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:44)
210
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:23)
211
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":24:21)
212
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":25:27)
213
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":25:37)
214
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":27:19)
215
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":28:51)
216
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":29:43)
217
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":31:40)
218
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":32:31)
219
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":33:29)
220
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:45)
221
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:41)
222
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:55)
223
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:50)
224
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:34)
225
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:70)
226
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:60)
227
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:122)
228
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:45)
229
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:41)
230
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:54)
231
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:73)
232
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:65)
233
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:99)
234
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:90)
235
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:81)
236
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:58)
237
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:50)
238
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:34)
239
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:116)
240
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:106)
241
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:168)
242
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":39:22)
243
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":41:23)
244
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":42:35)
245
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":42:48)
246
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":42:8)
247
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":43:25)
248
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":43:28)
249
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:54)
250
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:46)
251
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:80)
252
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:71)
253
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:62)
254
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:39)
255
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:35)
256
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:30)
257
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:87)
258
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":46:11)
259
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":47:18)
260
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":48:12)
261
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":49:19)
262
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":50:19)
263
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:25)
264
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:37)
265
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:4)
266
+ #loc63 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
267
+ #loc64 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
268
+ #loc65 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
269
+ #loc66 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
270
+ #loc67 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
271
+ #loc68 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
272
+ #loc69 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
273
+ #loc70 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
274
+ #loc71 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
275
+ #loc72 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
276
+ #loc73 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
277
+ #loc75 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
278
+ #loc77 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
279
+ #loc78 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
280
+ #loc80 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
281
+ #loc81 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
282
+ #loc82 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
283
+ #loc90 = loc("r0_numel"(#loc1))
284
+ #loc91 = loc("xoffset"(#loc2))
285
+ #loc92 = loc("xoffset"(#loc3))
286
+ #loc93 = loc("xindex"(#loc4))
287
+ #loc94 = loc("xindex"(#loc5))
288
+ #loc95 = loc("xindex"(#loc6))
289
+ #loc96 = loc("xmask"(#loc7))
290
+ #loc97 = loc("r0_base"(#loc8))
291
+ #loc98 = loc("r0_base"(#loc9))
292
+ #loc99 = loc("x0"(#loc10))
293
+ #loc100 = loc("x1"(#loc11))
294
+ #loc101 = loc("_tmp4"(#loc12))
295
+ #loc102 = loc("_tmp4"(#loc13))
296
+ #loc103 = loc("r0_index"(#loc14))
297
+ #loc104 = loc("r0_mask"(#loc15))
298
+ #loc105 = loc("tmp0"(#loc16))
299
+ #loc106 = loc("tmp0"(#loc17))
300
+ #loc107 = loc("tmp0"(#loc18))
301
+ #loc108 = loc("tmp0"(#loc19))
302
+ #loc109 = loc("tmp0"(#loc20))
303
+ #loc110 = loc("tmp0"(#loc21))
304
+ #loc111 = loc("tmp0"(#loc22))
305
+ #loc112 = loc("tmp0"(#loc23))
306
+ #loc113 = loc("tmp1"(#loc24))
307
+ #loc114 = loc("tmp1"(#loc25))
308
+ #loc115 = loc("tmp1"(#loc26))
309
+ #loc116 = loc("tmp1"(#loc27))
310
+ #loc117 = loc("tmp1"(#loc28))
311
+ #loc118 = loc("tmp1"(#loc29))
312
+ #loc119 = loc("tmp1"(#loc30))
313
+ #loc120 = loc("tmp1"(#loc31))
314
+ #loc121 = loc("tmp1"(#loc32))
315
+ #loc122 = loc("tmp1"(#loc33))
316
+ #loc123 = loc("tmp1"(#loc34))
317
+ #loc124 = loc("tmp1"(#loc35))
318
+ #loc125 = loc("tmp1"(#loc36))
319
+ #loc126 = loc("tmp1"(#loc37))
320
+ #loc127 = loc("tmp2"(#loc38))
321
+ #loc128 = loc("tmp5"(#loc39))
322
+ #loc129 = loc("_tmp4"(#loc40))
323
+ #loc130 = loc("_tmp4"(#loc41))
324
+ #loc131 = loc("tmp4"(#loc43))
325
+ #loc132 = loc("tmp4"(#loc44))
326
+ #loc133 = loc("tmp7"(#loc45))
327
+ #loc134 = loc("tmp7"(#loc46))
328
+ #loc135 = loc("tmp7"(#loc47))
329
+ #loc136 = loc("tmp7"(#loc48))
330
+ #loc137 = loc("tmp7"(#loc49))
331
+ #loc138 = loc("tmp7"(#loc50))
332
+ #loc139 = loc("tmp7"(#loc51))
333
+ #loc140 = loc("tmp7"(#loc52))
334
+ #loc141 = loc("tmp7"(#loc53))
335
+ #loc142 = loc("tmp8"(#loc54))
336
+ #loc143 = loc("tmp9"(#loc55))
337
+ #loc144 = loc("tmp10"(#loc56))
338
+ #loc145 = loc("tmp11"(#loc57))
339
+ #loc146 = loc("tmp12"(#loc58))
340
+ #loc149 = loc("quot"(#loc63))
341
+ #loc150 = loc("remainder"(#loc64))
342
+ #loc151 = loc("fixed"(#loc65))
343
+ #loc152 = loc("fixed"(#loc66))
344
+ #loc153 = loc("fixed"(#loc67))
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttgir ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
3
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":18:0)
4
+ #loc1 = loc(unknown)
5
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":43:25)
6
+ #loc56 = loc("in_ptr0"(#loc))
7
+ #loc57 = loc("in_ptr1"(#loc))
8
+ #loc58 = loc("in_ptr2"(#loc))
9
+ #loc59 = loc("out_ptr1"(#loc))
10
+ #loc60 = loc("ks0"(#loc))
11
+ #loc61 = loc("xnumel"(#loc))
12
+ #loc62 = loc("r0_numel"(#loc))
13
+ #loc99 = loc("tmp4"(#loc43))
14
+ #loc118 = loc(callsite(#loc1 at #loc99))
15
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
16
+ tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
17
+ %cst = arith.constant dense<0.693147182> : tensor<4x1xf32, #blocked> loc(#loc1)
18
+ %cst_0 = arith.constant dense<1.44269502> : tensor<4x1xf32, #blocked> loc(#loc1)
19
+ %cst_1 = arith.constant dense<0> : tensor<4x1xi64, #blocked> loc(#loc1)
20
+ %cst_2 = arith.constant dense<0> : tensor<4x1xi32, #blocked> loc(#loc1)
21
+ %cst_3 = arith.constant dense<1> : tensor<4x1xi64, #blocked> loc(#loc1)
22
+ %cst_4 = arith.constant dense<1> : tensor<4x1xi64, #blocked1> loc(#loc1)
23
+ %c4_i32 = arith.constant 4 : i32 loc(#loc1)
24
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked1> loc(#loc1)
25
+ %c0_i64 = arith.constant 0 : i64 loc(#loc1)
26
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
27
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1> loc(#loc1)
28
+ %cst_7 = arith.constant dense<0> : tensor<4x1xi32, #blocked1> loc(#loc1)
29
+ %cst_8 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1)
30
+ %cst_9 = arith.constant dense<128> : tensor<4x1xi64, #blocked1> loc(#loc1)
31
+ %cst_10 = arith.constant dense<4096> : tensor<4x1xi64, #blocked1> loc(#loc1)
32
+ %cst_11 = arith.constant dense<0> : tensor<4x1xi64, #blocked1> loc(#loc1)
33
+ %xoffset = tt.get_program_id x : i32 loc(#loc63)
34
+ %xoffset_12 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc64)
35
+ %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc65)
36
+ %xindex_13 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc65)
37
+ %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc65)
38
+ %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc65)
39
+ %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc66)
40
+ %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<4x1xi32, #blocked> loc(#loc66)
41
+ %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<4x1xi32, #blocked1> loc(#loc66)
42
+ %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<4x1xi32, #blocked> loc(#loc66)
43
+ %xmask = tt.splat %xnumel : i32 -> tensor<4x1xi32, #blocked1> loc(#loc67)
44
+ %xmask_20 = tt.splat %xnumel : i32 -> tensor<4x1xi32, #blocked> loc(#loc67)
45
+ %xmask_21 = arith.cmpi slt, %xindex_18, %xmask : tensor<4x1xi32, #blocked1> loc(#loc67)
46
+ %xmask_22 = arith.cmpi slt, %xindex_19, %xmask_20 : tensor<4x1xi32, #blocked> loc(#loc67)
47
+ %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc68)
48
+ %r0_base_23 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc68)
49
+ %x0 = arith.extsi %xindex_18 : tensor<4x1xi32, #blocked1> to tensor<4x1xi64, #blocked1> loc(#loc69)
50
+ %x0_24 = arith.extsi %xindex_19 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked> loc(#loc69)
51
+ %x0_25 = tt.splat %ks0 : i64 -> tensor<4x1xi64, #blocked1> loc(#loc69)
52
+ %x0_26 = tt.splat %ks0 : i64 -> tensor<4x1xi64, #blocked> loc(#loc69)
53
+ %x0_27 = arith.remsi %x0, %x0_25 : tensor<4x1xi64, #blocked1> loc(#loc69)
54
+ %x0_28 = arith.remsi %x0_24, %x0_26 : tensor<4x1xi64, #blocked> loc(#loc69)
55
+ %quot = arith.divsi %x0, %x0_25 : tensor<4x1xi64, #blocked1> loc(#loc108)
56
+ %quot_29 = arith.divsi %x0_24, %x0_26 : tensor<4x1xi64, #blocked> loc(#loc108)
57
+ %fixed = arith.cmpi ne, %x0_27, %cst_11 : tensor<4x1xi64, #blocked1> loc(#loc109)
58
+ %fixed_30 = arith.cmpi ne, %x0_28, %cst_1 : tensor<4x1xi64, #blocked> loc(#loc109)
59
+ %fixed_31 = arith.subi %quot, %cst_4 : tensor<4x1xi64, #blocked1> loc(#loc110)
60
+ %fixed_32 = arith.subi %quot_29, %cst_3 : tensor<4x1xi64, #blocked> loc(#loc110)
61
+ %fixed_33 = arith.select %fixed, %fixed_31, %quot : tensor<4x1xi1, #blocked1>, tensor<4x1xi64, #blocked1> loc(#loc111)
62
+ %fixed_34 = arith.select %fixed_30, %fixed_32, %quot_29 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked> loc(#loc111)
63
+ %x1 = arith.cmpi slt, %xindex_18, %cst_7 : tensor<4x1xi32, #blocked1> loc(#loc112)
64
+ %x1_35 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc112)
65
+ %x1_36 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc113)
66
+ %x1_37 = tt.splat %x1_36 : i1 -> tensor<4x1xi1, #blocked1> loc(#loc114)
67
+ %x1_38 = tt.splat %x1_36 : i1 -> tensor<4x1xi1, #blocked> loc(#loc114)
68
+ %x1_39 = arith.cmpi ne, %x1, %x1_37 : tensor<4x1xi1, #blocked1> loc(#loc114)
69
+ %x1_40 = arith.cmpi ne, %x1_35, %x1_38 : tensor<4x1xi1, #blocked> loc(#loc114)
70
+ %x1_41 = arith.select %x1_39, %fixed_33, %quot : tensor<4x1xi1, #blocked1>, tensor<4x1xi64, #blocked1> loc(#loc115)
71
+ %x1_42 = arith.select %x1_40, %fixed_34, %quot_29 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked> loc(#loc115)
72
+ %r0_mask = arith.cmpi slt, %r0_base_23, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc75)
73
+ %tmp0 = arith.muli %x1_41, %cst_9 : tensor<4x1xi64, #blocked1> loc(#loc76)
74
+ %tmp0_43 = arith.extsi %r0_base_23 : tensor<1x128xi32, #blocked1> to tensor<1x128xi64, #blocked1> loc(#loc77)
75
+ %tmp0_44 = tt.broadcast %tmp0_43 : tensor<1x128xi64, #blocked1> -> tensor<4x128xi64, #blocked1> loc(#loc77)
76
+ %tmp0_45 = tt.broadcast %tmp0 : tensor<4x1xi64, #blocked1> -> tensor<4x128xi64, #blocked1> loc(#loc77)
77
+ %tmp0_46 = arith.addi %tmp0_44, %tmp0_45 : tensor<4x128xi64, #blocked1> loc(#loc77)
78
+ %tmp0_47 = arith.muli %x0_27, %cst_10 : tensor<4x1xi64, #blocked1> loc(#loc78)
79
+ %tmp0_48 = tt.broadcast %tmp0_47 : tensor<4x1xi64, #blocked1> -> tensor<4x128xi64, #blocked1> loc(#loc79)
80
+ %tmp0_49 = arith.addi %tmp0_46, %tmp0_48 : tensor<4x128xi64, #blocked1> loc(#loc79)
81
+ %tmp0_50 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc80)
82
+ %tmp0_51 = tt.addptr %tmp0_50, %tmp0_49 : tensor<4x128x!tt.ptr<bf16>, #blocked1>, tensor<4x128xi64, #blocked1> loc(#loc80)
83
+ %tmp0_52 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked1> -> tensor<4x128xi1, #blocked1> loc(#loc81)
84
+ %tmp0_53 = tt.broadcast %xmask_21 : tensor<4x1xi1, #blocked1> -> tensor<4x128xi1, #blocked1> loc(#loc81)
85
+ %tmp0_54 = arith.andi %tmp0_52, %tmp0_53 : tensor<4x128xi1, #blocked1> loc(#loc81)
86
+ %tmp0_55 = tt.load %tmp0_51, %tmp0_54, %cst_5 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc82)
87
+ %tmp0_56 = arith.extf %tmp0_55 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc83)
88
+ %tmp1 = arith.muli %x0_27, %cst_9 : tensor<4x1xi64, #blocked1> loc(#loc84)
89
+ %tmp1_57 = tt.broadcast %tmp1 : tensor<4x1xi64, #blocked1> -> tensor<4x128xi64, #blocked1> loc(#loc85)
90
+ %tmp1_58 = arith.addi %tmp0_44, %tmp1_57 : tensor<4x128xi64, #blocked1> loc(#loc85)
91
+ %tmp1_59 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc86)
92
+ %tmp1_60 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc87)
93
+ %tmp1_61 = arith.extui %tmp1_60 : i1 to i64 loc(#loc88)
94
+ %tmp1_62 = arith.muli %ks0, %tmp1_61 : i64 loc(#loc88)
95
+ %tmp1_63 = arith.extui %tmp1_59 : i1 to i64 loc(#loc116)
96
+ %tmp1_64 = arith.addi %tmp1_63, %tmp1_62 : i64 loc(#loc89)
97
+ %tmp1_65 = tt.splat %tmp1_64 : i64 -> tensor<4x1xi64, #blocked1> loc(#loc91)
98
+ %tmp1_66 = tt.splat %tmp1_64 : i64 -> tensor<4x1xi64, #blocked> loc(#loc91)
99
+ %tmp1_67 = arith.muli %tmp0, %tmp1_65 : tensor<4x1xi64, #blocked1> loc(#loc91)
100
+ %tmp1_68 = tt.broadcast %tmp1_67 : tensor<4x1xi64, #blocked1> -> tensor<4x128xi64, #blocked1> loc(#loc92)
101
+ %tmp1_69 = arith.addi %tmp1_58, %tmp1_68 : tensor<4x128xi64, #blocked1> loc(#loc92)
102
+ %tmp1_70 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc93)
103
+ %tmp1_71 = tt.addptr %tmp1_70, %tmp1_69 : tensor<4x128x!tt.ptr<bf16>, #blocked1>, tensor<4x128xi64, #blocked1> loc(#loc93)
104
+ %tmp1_72 = tt.load %tmp1_71, %tmp0_54, %cst_5 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc94)
105
+ %tmp1_73 = arith.extf %tmp1_72 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc95)
106
+ %tmp2 = arith.mulf %tmp0_56, %tmp1_73 : tensor<4x128xf32, #blocked1> loc(#loc96)
107
+ %tmp5 = arith.addf %tmp2, %cst_6 : tensor<4x128xf32, #blocked1> loc(#loc97)
108
+ %_tmp4 = arith.select %tmp0_54, %tmp5, %cst_6 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1> loc(#loc98)
109
+ %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
110
+ ^bb0(%tmp4_80: f32 loc(callsite(#loc1 at #loc99)), %tmp4_81: f32 loc(callsite(#loc1 at #loc99))):
111
+ %tmp4_82 = arith.addf %tmp4_80, %tmp4_81 : f32 loc(#loc119)
112
+ tt.reduce.return %tmp4_82 : f32 loc(#loc117)
113
+ }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc117)
114
+ %tmp12 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc100)
115
+ %tmp4_74 = tt.expand_dims %tmp12 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xf32, #blocked> loc(#loc101)
116
+ %tmp7 = arith.muli %x1_42, %tmp1_66 : tensor<4x1xi64, #blocked> loc(#loc102)
117
+ %tmp7_75 = arith.addi %x0_28, %tmp7 : tensor<4x1xi64, #blocked> loc(#loc103)
118
+ %tmp7_76 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>, #blocked> loc(#loc104)
119
+ %tmp7_77 = tt.addptr %tmp7_76, %tmp7_75 : tensor<4x1x!tt.ptr<f32>, #blocked>, tensor<4x1xi64, #blocked> loc(#loc104)
120
+ %tmp7_78 = tt.load %tmp7_77, %xmask_22 evictionPolicy = evict_last : tensor<4x1x!tt.ptr<f32>, #blocked> loc(#loc105)
121
+ %tmp9 = arith.mulf %tmp7_78, %cst : tensor<4x1xf32, #blocked> loc(#loc106)
122
+ %tmp11 = arith.mulf %tmp9, %cst_0 : tensor<4x1xf32, #blocked> loc(#loc107)
123
+ %tmp12_79 = arith.subf %tmp4_74, %tmp11 : tensor<4x1xf32, #blocked> loc(#loc100)
124
+ %0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>, #blocked> loc(#loc53)
125
+ %1 = tt.addptr %0, %xindex_19 : tensor<4x1x!tt.ptr<f32>, #blocked>, tensor<4x1xi32, #blocked> loc(#loc53)
126
+ tt.store %1, %tmp12_79, %xmask_22 : tensor<4x1x!tt.ptr<f32>, #blocked> loc(#loc54)
127
+ tt.return loc(#loc55)
128
+ } loc(#loc)
129
+ } loc(#loc)
130
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":22:28)
131
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":22:33)
132
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:44)
133
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:23)
134
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":24:21)
135
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":25:37)
136
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":27:19)
137
+ #loc9 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
138
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":28:51)
139
+ #loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
140
+ #loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
141
+ #loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
142
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
143
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
144
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
145
+ #loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
146
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":33:29)
147
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:45)
148
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:41)
149
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:55)
150
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:50)
151
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:34)
152
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:70)
153
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:60)
154
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:122)
155
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:45)
156
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:41)
157
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:73)
158
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:99)
159
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:90)
160
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:81)
161
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:65)
162
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:58)
163
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:50)
164
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:34)
165
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:106)
166
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:168)
167
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":39:22)
168
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":41:23)
169
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":42:48)
170
+ #loc42 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
171
+ #loc44 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
172
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":50:19)
173
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":43:28)
174
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:39)
175
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:35)
176
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:30)
177
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:87)
178
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":47:18)
179
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":49:19)
180
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:25)
181
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:37)
182
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:4)
183
+ #loc63 = loc("xoffset"(#loc2))
184
+ #loc64 = loc("xoffset"(#loc3))
185
+ #loc65 = loc("xindex"(#loc4))
186
+ #loc66 = loc("xindex"(#loc5))
187
+ #loc67 = loc("xmask"(#loc6))
188
+ #loc68 = loc("r0_base"(#loc7))
189
+ #loc69 = loc("x0"(#loc8))
190
+ #loc70 = loc("quot"(#loc9))
191
+ #loc71 = loc("x1"(#loc10))
192
+ #loc72 = loc("fixed"(#loc11))
193
+ #loc73 = loc("fixed"(#loc12))
194
+ #loc74 = loc("fixed"(#loc13))
195
+ #loc75 = loc("r0_mask"(#loc18))
196
+ #loc76 = loc("tmp0"(#loc19))
197
+ #loc77 = loc("tmp0"(#loc20))
198
+ #loc78 = loc("tmp0"(#loc21))
199
+ #loc79 = loc("tmp0"(#loc22))
200
+ #loc80 = loc("tmp0"(#loc23))
201
+ #loc81 = loc("tmp0"(#loc24))
202
+ #loc82 = loc("tmp0"(#loc25))
203
+ #loc83 = loc("tmp0"(#loc26))
204
+ #loc84 = loc("tmp1"(#loc27))
205
+ #loc85 = loc("tmp1"(#loc28))
206
+ #loc86 = loc("tmp1"(#loc29))
207
+ #loc87 = loc("tmp1"(#loc30))
208
+ #loc88 = loc("tmp1"(#loc31))
209
+ #loc89 = loc("tmp1"(#loc32))
210
+ #loc90 = loc("tmp1"(#loc33))
211
+ #loc91 = loc("tmp1"(#loc34))
212
+ #loc92 = loc("tmp1"(#loc35))
213
+ #loc93 = loc("tmp1"(#loc36))
214
+ #loc94 = loc("tmp1"(#loc37))
215
+ #loc95 = loc("tmp1"(#loc38))
216
+ #loc96 = loc("tmp2"(#loc39))
217
+ #loc97 = loc("tmp5"(#loc40))
218
+ #loc98 = loc("_tmp4"(#loc41))
219
+ #loc100 = loc("tmp12"(#loc45))
220
+ #loc101 = loc("tmp4"(#loc46))
221
+ #loc102 = loc("tmp7"(#loc47))
222
+ #loc103 = loc("tmp7"(#loc48))
223
+ #loc104 = loc("tmp7"(#loc49))
224
+ #loc105 = loc("tmp7"(#loc50))
225
+ #loc106 = loc("tmp9"(#loc51))
226
+ #loc107 = loc("tmp11"(#loc52))
227
+ #loc108 = loc(callsite(#loc70 at #loc71))
228
+ #loc109 = loc(callsite(#loc72 at #loc71))
229
+ #loc110 = loc(callsite(#loc73 at #loc71))
230
+ #loc111 = loc(callsite(#loc74 at #loc71))
231
+ #loc112 = loc(callsite(#loc14 at #loc71))
232
+ #loc113 = loc(callsite(#loc15 at #loc71))
233
+ #loc114 = loc(callsite(#loc16 at #loc71))
234
+ #loc115 = loc(callsite(#loc17 at #loc71))
235
+ #loc116 = loc(fused[#loc89, #loc90])
236
+ #loc117 = loc(callsite(#loc42 at #loc99))
237
+ #loc119 = loc(callsite(#loc44 at #loc117))
progress/github/SpecForge/cache/compiled_kernels/triton/3/7ER3AVTZOT7CXEBCFLOGF5JGIU47K65LHKLEWCY3SCFPJHJ6GTWQ/triton_red_fused_mul_0.ttir ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":18:0)
2
+ #loc6 = loc(unknown)
3
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":43:25)
4
+ #loc58 = loc("in_ptr0"(#loc))
5
+ #loc59 = loc("in_ptr1"(#loc))
6
+ #loc60 = loc("in_ptr2"(#loc))
7
+ #loc61 = loc("out_ptr1"(#loc))
8
+ #loc62 = loc("ks0"(#loc))
9
+ #loc63 = loc("xnumel"(#loc))
10
+ #loc64 = loc("r0_numel"(#loc))
11
+ #loc105 = loc("tmp4"(#loc47))
12
+ #loc122 = loc(callsite(#loc6 at #loc105))
13
+ module {
14
+ tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
15
+ %fixed = arith.constant dense<1> : tensor<4x1xi64> loc(#loc112)
16
+ %x1 = arith.constant dense<0> : tensor<4x1xi32> loc(#loc113)
17
+ %fixed_0 = arith.constant dense<0> : tensor<4x1xi64> loc(#loc114)
18
+ %x1_1 = arith.constant 0 : i64 loc(#loc115)
19
+ %c1_i64 = arith.constant 1 : i64 loc(#loc6)
20
+ %cst = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc6)
21
+ %tmp11 = arith.constant dense<1.44269502> : tensor<4x1xf32> loc(#loc68)
22
+ %tmp9 = arith.constant dense<0.693147182> : tensor<4x1xf32> loc(#loc69)
23
+ %cst_2 = arith.constant dense<4096> : tensor<4x1xi64> loc(#loc6)
24
+ %cst_3 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc6)
25
+ %cst_4 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc6)
26
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc6)
27
+ %c4_i32 = arith.constant 4 : i32 loc(#loc6)
28
+ %xoffset = tt.get_program_id x : i32 loc(#loc70)
29
+ %xoffset_6 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc71)
30
+ %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc72)
31
+ %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc73)
32
+ %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<4x1xi32> loc(#loc74)
33
+ %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<4x1xi32> loc(#loc74)
34
+ %xmask = tt.splat %xnumel : i32 -> tensor<4x1xi32> loc(#loc75)
35
+ %xmask_10 = arith.cmpi slt, %xindex_9, %xmask : tensor<4x1xi32> loc(#loc75)
36
+ %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc76)
37
+ %r0_base_11 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc77)
38
+ %x0 = arith.extsi %xindex_9 : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc78)
39
+ %x0_12 = tt.splat %ks0 : i64 -> tensor<4x1xi64> loc(#loc78)
40
+ %x0_13 = arith.remsi %x0, %x0_12 : tensor<4x1xi64> loc(#loc78)
41
+ %quot = arith.divsi %x0, %x0_12 : tensor<4x1xi64> loc(#loc116)
42
+ %fixed_14 = arith.cmpi ne, %x0_13, %fixed_0 : tensor<4x1xi64> loc(#loc114)
43
+ %fixed_15 = arith.subi %quot, %fixed : tensor<4x1xi64> loc(#loc112)
44
+ %fixed_16 = arith.select %fixed_14, %fixed_15, %quot : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc117)
45
+ %x1_17 = arith.cmpi slt, %xindex_9, %x1 : tensor<4x1xi32> loc(#loc113)
46
+ %x1_18 = arith.cmpi slt, %ks0, %x1_1 : i64 loc(#loc115)
47
+ %x1_19 = tt.splat %x1_18 : i1 -> tensor<4x1xi1> loc(#loc118)
48
+ %x1_20 = arith.cmpi ne, %x1_17, %x1_19 : tensor<4x1xi1> loc(#loc118)
49
+ %x1_21 = arith.select %x1_20, %fixed_16, %quot : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc119)
50
+ %r0_mask = arith.cmpi slt, %r0_base_11, %cst_4 : tensor<1x128xi32> loc(#loc81)
51
+ %tmp0 = arith.muli %x1_21, %cst_3 : tensor<4x1xi64> loc(#loc82)
52
+ %tmp0_22 = arith.extsi %r0_base_11 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc83)
53
+ %tmp0_23 = tt.broadcast %tmp0_22 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc83)
54
+ %tmp0_24 = tt.broadcast %tmp0 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc83)
55
+ %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<4x128xi64> loc(#loc83)
56
+ %tmp0_26 = arith.muli %x0_13, %cst_2 : tensor<4x1xi64> loc(#loc84)
57
+ %tmp0_27 = tt.broadcast %tmp0_26 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc85)
58
+ %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<4x128xi64> loc(#loc85)
59
+ %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc86)
60
+ %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi64> loc(#loc86)
61
+ %tmp0_31 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc87)
62
+ %tmp0_32 = tt.broadcast %xmask_10 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc87)
63
+ %tmp0_33 = arith.andi %tmp0_31, %tmp0_32 : tensor<4x128xi1> loc(#loc87)
64
+ %tmp0_34 = tt.load %tmp0_30, %tmp0_33, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc88)
65
+ %tmp0_35 = arith.extf %tmp0_34 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc89)
66
+ %tmp1 = arith.muli %x0_13, %cst_3 : tensor<4x1xi64> loc(#loc90)
67
+ %tmp1_36 = tt.broadcast %tmp1 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc91)
68
+ %tmp1_37 = arith.addi %tmp0_23, %tmp1_36 : tensor<4x128xi64> loc(#loc91)
69
+ %tmp1_38 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc92)
70
+ %tmp1_39 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc93)
71
+ %tmp1_40 = arith.extui %tmp1_39 : i1 to i64 loc(#loc94)
72
+ %tmp1_41 = arith.muli %ks0, %tmp1_40 : i64 loc(#loc94)
73
+ %tmp1_42 = arith.extui %tmp1_38 : i1 to i64 loc(#loc120)
74
+ %tmp1_43 = arith.addi %tmp1_42, %tmp1_41 : i64 loc(#loc95)
75
+ %tmp1_44 = tt.splat %tmp1_43 : i64 -> tensor<4x1xi64> loc(#loc97)
76
+ %tmp1_45 = arith.muli %tmp0, %tmp1_44 : tensor<4x1xi64> loc(#loc97)
77
+ %tmp1_46 = tt.broadcast %tmp1_45 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc98)
78
+ %tmp1_47 = arith.addi %tmp1_37, %tmp1_46 : tensor<4x128xi64> loc(#loc98)
79
+ %tmp1_48 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc99)
80
+ %tmp1_49 = tt.addptr %tmp1_48, %tmp1_47 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi64> loc(#loc99)
81
+ %tmp1_50 = tt.load %tmp1_49, %tmp0_33, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc100)
82
+ %tmp1_51 = arith.extf %tmp1_50 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc101)
83
+ %tmp2 = arith.mulf %tmp0_35, %tmp1_51 : tensor<4x128xf32> loc(#loc102)
84
+ %tmp5 = arith.addf %tmp2, %cst_5 : tensor<4x128xf32> loc(#loc103)
85
+ %_tmp4 = arith.select %tmp0_33, %tmp5, %cst_5 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc104)
86
+ %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
87
+ ^bb0(%tmp4_59: f32 loc(callsite(#loc6 at #loc105)), %tmp4_60: f32 loc(callsite(#loc6 at #loc105))):
88
+ %tmp4_61 = arith.addf %tmp4_59, %tmp4_60 : f32 loc(#loc123)
89
+ tt.reduce.return %tmp4_61 : f32 loc(#loc121)
90
+ }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc121)
91
+ %tmp4_52 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc106)
92
+ %tmp7 = arith.muli %x1_21, %tmp1_44 : tensor<4x1xi64> loc(#loc107)
93
+ %tmp7_53 = arith.addi %x0_13, %tmp7 : tensor<4x1xi64> loc(#loc108)
94
+ %tmp7_54 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc109)
95
+ %tmp7_55 = tt.addptr %tmp7_54, %tmp7_53 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi64> loc(#loc109)
96
+ %tmp7_56 = tt.load %tmp7_55, %xmask_10 evictionPolicy = evict_last : tensor<4x1x!tt.ptr<f32>> loc(#loc110)
97
+ %tmp9_57 = arith.mulf %tmp7_56, %tmp9 : tensor<4x1xf32> loc(#loc69)
98
+ %tmp11_58 = arith.mulf %tmp9_57, %tmp11 : tensor<4x1xf32> loc(#loc68)
99
+ %tmp12 = arith.subf %tmp4_52, %tmp11_58 : tensor<4x1xf32> loc(#loc111)
100
+ %0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc55)
101
+ %1 = tt.addptr %0, %xindex_9 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi32> loc(#loc55)
102
+ tt.store %1, %tmp12, %xmask_10 : tensor<4x1x!tt.ptr<f32>> loc(#loc56)
103
+ tt.return loc(#loc57)
104
+ } loc(#loc)
105
+ } loc(#loc)
106
+ #loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
107
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":28:51)
108
+ #loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
109
+ #loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
110
+ #loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
111
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":49:19)
112
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":47:18)
113
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":22:28)
114
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":22:33)
115
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:36)
116
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:44)
117
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":23:23)
118
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":24:21)
119
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":25:27)
120
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":25:37)
121
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":27:19)
122
+ #loc18 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
123
+ #loc19 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
124
+ #loc20 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
125
+ #loc21 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
126
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":33:29)
127
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:45)
128
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:41)
129
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:55)
130
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:50)
131
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:34)
132
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:70)
133
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:60)
134
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":37:122)
135
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:45)
136
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:41)
137
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:73)
138
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:99)
139
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:90)
140
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:81)
141
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:65)
142
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:58)
143
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:50)
144
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:34)
145
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:106)
146
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":38:168)
147
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":39:22)
148
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":41:23)
149
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":42:48)
150
+ #loc46 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
151
+ #loc48 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
152
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":43:28)
153
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:39)
154
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:35)
155
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:30)
156
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":44:87)
157
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":50:19)
158
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:25)
159
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:37)
160
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/3q/c3qbvcsx2w7qss2v3eocuadgz6t35joo33bflzqkxzzj747zcjpk.py":51:4)
161
+ #loc65 = loc("fixed"(#loc1))
162
+ #loc66 = loc("x1"(#loc2))
163
+ #loc67 = loc("fixed"(#loc4))
164
+ #loc68 = loc("tmp11"(#loc7))
165
+ #loc69 = loc("tmp9"(#loc8))
166
+ #loc70 = loc("xoffset"(#loc9))
167
+ #loc71 = loc("xoffset"(#loc10))
168
+ #loc72 = loc("xindex"(#loc11))
169
+ #loc73 = loc("xindex"(#loc12))
170
+ #loc74 = loc("xindex"(#loc13))
171
+ #loc75 = loc("xmask"(#loc14))
172
+ #loc76 = loc("r0_base"(#loc15))
173
+ #loc77 = loc("r0_base"(#loc16))
174
+ #loc78 = loc("x0"(#loc17))
175
+ #loc79 = loc("quot"(#loc18))
176
+ #loc80 = loc("fixed"(#loc19))
177
+ #loc81 = loc("r0_mask"(#loc22))
178
+ #loc82 = loc("tmp0"(#loc23))
179
+ #loc83 = loc("tmp0"(#loc24))
180
+ #loc84 = loc("tmp0"(#loc25))
181
+ #loc85 = loc("tmp0"(#loc26))
182
+ #loc86 = loc("tmp0"(#loc27))
183
+ #loc87 = loc("tmp0"(#loc28))
184
+ #loc88 = loc("tmp0"(#loc29))
185
+ #loc89 = loc("tmp0"(#loc30))
186
+ #loc90 = loc("tmp1"(#loc31))
187
+ #loc91 = loc("tmp1"(#loc32))
188
+ #loc92 = loc("tmp1"(#loc33))
189
+ #loc93 = loc("tmp1"(#loc34))
190
+ #loc94 = loc("tmp1"(#loc35))
191
+ #loc95 = loc("tmp1"(#loc36))
192
+ #loc96 = loc("tmp1"(#loc37))
193
+ #loc97 = loc("tmp1"(#loc38))
194
+ #loc98 = loc("tmp1"(#loc39))
195
+ #loc99 = loc("tmp1"(#loc40))
196
+ #loc100 = loc("tmp1"(#loc41))
197
+ #loc101 = loc("tmp1"(#loc42))
198
+ #loc102 = loc("tmp2"(#loc43))
199
+ #loc103 = loc("tmp5"(#loc44))
200
+ #loc104 = loc("_tmp4"(#loc45))
201
+ #loc106 = loc("tmp4"(#loc49))
202
+ #loc107 = loc("tmp7"(#loc50))
203
+ #loc108 = loc("tmp7"(#loc51))
204
+ #loc109 = loc("tmp7"(#loc52))
205
+ #loc110 = loc("tmp7"(#loc53))
206
+ #loc111 = loc("tmp12"(#loc54))
207
+ #loc112 = loc(callsite(#loc65 at #loc66))
208
+ #loc113 = loc(callsite(#loc3 at #loc66))
209
+ #loc114 = loc(callsite(#loc67 at #loc66))
210
+ #loc115 = loc(callsite(#loc5 at #loc66))
211
+ #loc116 = loc(callsite(#loc79 at #loc66))
212
+ #loc117 = loc(callsite(#loc80 at #loc66))
213
+ #loc118 = loc(callsite(#loc20 at #loc66))
214
+ #loc119 = loc(callsite(#loc21 at #loc66))
215
+ #loc120 = loc(fused[#loc95, #loc96])
216
+ #loc121 = loc(callsite(#loc46 at #loc105))
217
+ #loc123 = loc(callsite(#loc48 at #loc121))
progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/__grp__triton_tem_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.source", "triton_tem_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ttir", "triton_tem_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ttgir", "triton_tem_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.llir", "triton_tem_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ptx", "triton_tem_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.cubin", "triton_tem_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.json"}}
progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "fdf02639e61a052085bc6f3d4b3b415d01ac9f98c2dda2f65525dae22fcdaa63", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_mul_1"}
progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.llir ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ptx ADDED
The diff for this file is too large to render. See raw diff
 
progress/github/SpecForge/cache/compiled_kernels/triton/3/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.source ADDED
The diff for this file is too large to render. See raw diff